library(data.table) # Data table Tools
library(summarytools) # Summary Tools
library(ggplot2) # Plot Visualization Tool
library(data.table) # fread
library(rbokeh) # visualization of data
library(summarytools) # ORIGINALSummary
library(adabag) # bagging and boosting
library(caret) # pre-processing
library(dplyr) # select, mutate_if
library(fastDummies) # dummy_cols
library(splitTools) # data partition
library(rpart) # classification tree
library(rpart.plot) # plot regression trees
library(DT) # datatable
library(corrplot) # corrplot
library(gains) # gain
library(randomForest) # randomForest
library(cluster) # hierarchical clustering
library(knitr) # kable
library(kableExtra) # kbl
library(MASS) # lda, qda, etc.
library(dplyr) # Data Wrangling Tools
library(klaR) # partimat
library(forecast)
library(pROC)
library(tibble)
library(mda) # mda
library(RColorBrewer) # Color Palette
library(tidyverse) # useful Dataframe tools
library(glmnet) # Logistic Lasso Regression
library(car) # VIF
library(ROCR) # ROC Curve
library(neuralnet) # Neural Network
library(nnet) # Neural network
library(factoextra) # K-Means Clustering
library(ggpubr) # ggplot addons
library(GGally) # pairs plots
library (naniar)
# Confusion matrix
# Draw a 2x2 confusion matrix (caret confusionMatrix object) with a
# per-class statistics panel below it. `titleaddon` is appended to the title.
draw_confusion_matrix <- function(cm, titleaddon = '') {
  # Two stacked panels: matrix (double height) on top, details underneath
  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # --- Panel 1: the confusion matrix grid ---
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
  title(paste0('CONFUSION MATRIX', ' ', titleaddon), cex.main = 2)
  # quadrants and axis labels
  rect(150, 430, 240, 370, col = '#1c6155')
  text(195, 435, 'Benign', cex = 1.2)
  rect(250, 430, 340, 370, col = '#1c615570')
  text(295, 435, 'Malignant', cex = 1.2)
  text(125, 370, 'Predicted', cex = 1.3, srt = 90, font = 2)
  text(245, 450, 'Actual', cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = '#1c615570')
  rect(250, 305, 340, 365, col = '#1c6155')
  text(140, 400, 'Benign', cex = 1.2, srt = 90)
  text(140, 335, 'Malignant', cex = 1.2, srt = 90)

  # write the four counts; as.numeric() flattens cm$table column-major
  counts <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_along(counts)) {
    text(cell_x[k], cell_y[k], counts[k], cex = 1.6, font = 2, col = 'white')
  }

  # --- Panel 2: selected by-class statistics plus overall accuracy/kappa ---
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "", main = "DETAILS", xaxt = 'n', yaxt = 'n')
  stat_idx <- c(1, 2, 5, 6, 7, 8)      # which cm$byClass entries are shown
  stat_x <- c(5, 23, 41, 59, 77, 95)   # evenly spaced x positions
  for (k in seq_along(stat_idx)) {
    i <- stat_idx[k]
    text(stat_x[k], 85, names(cm$byClass[i]), cex = 1.2, font = 2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[i]), 3), cex = 1.2)
  }
  # first two cm$overall entries (name above, rounded value below)
  overall_x <- c(30, 70)
  for (k in 1:2) {
    text(overall_x[k], 35, names(cm$overall[k]), cex = 1.5, font = 2)
    text(overall_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex = 1.4)
  }
}
# Load the raw breast-cancer data set (fast CSV read -> data.table)
ORIGINAL <- fread("data/Breast_Cancer/breast-cancer.csv")
# printing summary of every column (rendered as an HTML table below)
print(dfSummary(ORIGINAL, valid.col = FALSE, graph.magnif = 0.75, plain.ascii = FALSE, html = TRUE, style ='grid', silent = TRUE), max.tbl.height = 300, width = 80, method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | id [integer] |
|
569 distinct values | 0 (0.0%) | |||||||||||
| 2 | diagnosis [character] |
|
|
0 (0.0%) | |||||||||||
| 3 | radius_mean [numeric] |
|
456 distinct values | 0 (0.0%) | |||||||||||
| 4 | texture_mean [numeric] |
|
479 distinct values | 0 (0.0%) | |||||||||||
| 5 | perimeter_mean [numeric] |
|
522 distinct values | 0 (0.0%) | |||||||||||
| 6 | area_mean [numeric] |
|
539 distinct values | 0 (0.0%) | |||||||||||
| 7 | smoothness_mean [numeric] |
|
474 distinct values | 0 (0.0%) | |||||||||||
| 8 | compactness_mean [numeric] |
|
537 distinct values | 0 (0.0%) | |||||||||||
| 9 | concavity_mean [numeric] |
|
537 distinct values | 0 (0.0%) | |||||||||||
| 10 | concave points_mean [numeric] |
|
542 distinct values | 0 (0.0%) | |||||||||||
| 11 | symmetry_mean [numeric] |
|
432 distinct values | 0 (0.0%) | |||||||||||
| 12 | fractal_dimension_mean [numeric] |
|
499 distinct values | 0 (0.0%) | |||||||||||
| 13 | radius_se [numeric] |
|
540 distinct values | 0 (0.0%) | |||||||||||
| 14 | texture_se [numeric] |
|
519 distinct values | 0 (0.0%) | |||||||||||
| 15 | perimeter_se [numeric] |
|
533 distinct values | 0 (0.0%) | |||||||||||
| 16 | area_se [numeric] |
|
528 distinct values | 0 (0.0%) | |||||||||||
| 17 | smoothness_se [numeric] |
|
547 distinct values | 0 (0.0%) | |||||||||||
| 18 | compactness_se [numeric] |
|
541 distinct values | 0 (0.0%) | |||||||||||
| 19 | concavity_se [numeric] |
|
533 distinct values | 0 (0.0%) | |||||||||||
| 20 | concave points_se [numeric] |
|
507 distinct values | 0 (0.0%) | |||||||||||
| 21 | symmetry_se [numeric] |
|
498 distinct values | 0 (0.0%) | |||||||||||
| 22 | fractal_dimension_se [numeric] |
|
545 distinct values | 0 (0.0%) | |||||||||||
| 23 | radius_worst [numeric] |
|
457 distinct values | 0 (0.0%) | |||||||||||
| 24 | texture_worst [numeric] |
|
511 distinct values | 0 (0.0%) | |||||||||||
| 25 | perimeter_worst [numeric] |
|
514 distinct values | 0 (0.0%) | |||||||||||
| 26 | area_worst [numeric] |
|
544 distinct values | 0 (0.0%) | |||||||||||
| 27 | smoothness_worst [numeric] |
|
411 distinct values | 0 (0.0%) | |||||||||||
| 28 | compactness_worst [numeric] |
|
529 distinct values | 0 (0.0%) | |||||||||||
| 29 | concavity_worst [numeric] |
|
539 distinct values | 0 (0.0%) | |||||||||||
| 30 | concave points_worst [numeric] |
|
492 distinct values | 0 (0.0%) | |||||||||||
| 31 | symmetry_worst [numeric] |
|
500 distinct values | 0 (0.0%) | |||||||||||
| 32 | fractal_dimension_worst [numeric] |
|
535 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.1.3)
2022-12-17
# Plot the number of missing values (NAs) per variable
gg_miss_var(ORIGINAL) + ggtitle("NAs")
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## i Please use "none" instead.
## i The deprecated feature was likely used in the naniar package.
## Please report the issue at <]8;;https://github.com/njtierney/naniar/issueshttps://github.com/njtierney/naniar/issues]8;;>.
# Center and scale all numeric columns so the boxplots share a comparable scale
norm.value <- preProcess(ORIGINAL, method = c("center", "scale"))
ORIGINAL.boxplot <- predict(norm.value, ORIGINAL)
# Melt to long format for faceted boxplots. Naming id.vars explicitly pins
# the intended layout (diagnosis as the id column, all numeric columns as
# values) and silences melt()'s "id.vars and measure.vars are internally
# guessed" warning, which would otherwise pick diagnosis by guesswork.
ORIGINAL.boxplot <- melt(dplyr::select(ORIGINAL.boxplot, -c(id)), id.vars = "diagnosis")
## Warning in melt.data.table(dplyr::select(ORIGINAL.boxplot, -c(id))): id.vars
## and measure.vars are internally guessed when both are 'NULL'. All non-numeric/
## integer/logical type columns are considered id.vars, which in this case are
## columns [diagnosis, ...]. Consider providing at least one of 'id' or 'measure'
## vars in future.
library(ggplot2)
# Standardized boxplots of every predictor split by diagnosis, one facet per
# variable; stat_boxplot(geom = 'errorbar') draws whisker caps under the boxes
ggplot(ORIGINAL.boxplot, aes(x = diagnosis, y = value)) +
facet_wrap(~variable) +
stat_boxplot(geom ='errorbar') +
geom_boxplot()
# histogram over all columns (id dropped, diagnosis kept out of the gather
# so each remaining column becomes one facet)
ggplot(gather(dplyr::select(ORIGINAL, -c(id)), key, value, -diagnosis), aes(value)) +
geom_histogram(bins = 10) +
facet_wrap(~key, scales = 'free')
# histogram over all columns grouped by diagnosis
# `..density..` was deprecated in ggplot2 3.4.0 (see the warning this chunk
# used to emit); after_stat(density) is the modern, equivalent spelling.
ggplot(gather(dplyr::select(ORIGINAL, -c(id)), key, value, -diagnosis), aes(value, fill = factor(diagnosis))) +
geom_histogram(aes(y = after_stat(density)), alpha = 0.6, position = "identity") +
facet_wrap(~key, scales = 'free') +
ggtitle("Histogram of predictors separated by class") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill=guide_legend(title="Diagnosis"))+
scale_fill_discrete(labels=c('benign', 'malignant'))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## i Please use `after_stat(density)` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Color Palette
library("RColorBrewer")
# select numeric variables (columns 1-2 are id and diagnosis), dropping NA rows
Corr_Data <- na.omit(ORIGINAL[,-c(1,2)])
# pairwise correlation matrix of all predictors
Corr_plot <- cor(Corr_Data)
# Correlations plotting (color-coded cells with numeric coefficients overlaid)
corrplot(Corr_plot, method = "color", col=brewer.pal(n=8, name="BuGn"),tl.col="black",tl.srt=45, addCoef.col = "black",number.cex = 1)
# Pairs plots of the three predictor groups - "mean" (cols 3:12),
# "se" (13:22) and "worst" (23:32) - colored by diagnosis,
# with the diagonal panels blanked
ggpairs(ORIGINAL, columns = 3:12, aes(color = diagnosis, alpha = 0.5),
diag = list(continuous = "blankDiag"), title = "Pairs plot for Means of Predictors")+
theme(axis.text.x = element_text(angle = 90), axis.title.y.right = element_text(size = 0.4)) # readable x axis
ggpairs(ORIGINAL, columns = 13:22, aes(color = diagnosis, alpha = 0.5),
diag = list(continuous = "blankDiag"), title = "Pairs plot for SE of Predictors")+
theme(axis.text.x = element_text(angle = 90), axis.title.y.right = element_text(size = 0.4)) # readable x axis
ggpairs(ORIGINAL, columns = 23:32, aes(color = diagnosis, alpha = 0.5),
diag = list(continuous = "blankDiag"), title = "Pairs plot for Worsts of Predictors")+
theme(axis.text.x = element_text(angle = 90),axis.title.y.right = element_text(size = 0.4)) # readable x axis
# Recode the outcome "diagnosis" from character to a binary factor:
# "M" (malignant) -> "1", "B" (benign) -> "0"
ORIGINAL[diagnosis == "M", diagnosis := "1"]
ORIGINAL[diagnosis == "B", diagnosis := "0"]
# Convert the recoded column to a factor with levels "0" < "1"
ORIGINAL[, diagnosis := as.factor(diagnosis)]
We considered normalizing the ORIGINAL data either by centering and scaling (z-scores) or by range ([0, 1]) scaling.
We partition the data into Training (50%), Validation (30%) and Test (20%)
set.seed(1)
# Splitting each Set from the ORIGINAL Dataset: one multinomial draw per row,
# so realized proportions are only approximately 50/30/20 (checked below)
splitting <- sample(1:3,size=nrow(ORIGINAL),replace=TRUE,prob=c(0.5,0.3,0.2))
Training <- ORIGINAL[splitting==1,]
Validation <- ORIGINAL[splitting==2,]
Test <- ORIGINAL[splitting==3,]
# Checking if proportions are right (as percentages of all rows)
Prop_Training <- (nrow(Training)/nrow(ORIGINAL))*100
Prop_Validation <- (nrow(Validation)/nrow(ORIGINAL))*100
Prop_Test <- (nrow(Test)/nrow(ORIGINAL))*100
# Print Proportion
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")
## [1] "The Proportions are: 52.72 % In Training, 27.94 % In Validation, and 19.33 % In Test"
Assumptions for Logistic Regression:
Type of Logistic Regression:
set.seed(1)
# Working copies of the Training and Validation sets for the logistic
# models, with the non-predictive "id" column dropped in the same step
Training_Logistic <- Training[, !"id"]
Validation_Logistic <- Validation[, !"id"]
set.seed(1)
# Fit The Logistic Regression Model on all "mean" and "se" predictors.
# The warning below (fitted probabilities numerically 0 or 1) signals
# (quasi-)complete separation - discussed in the comments further down.
Logistic_Model_1 <- glm(diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + compactness_se + concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se, family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_1)
##
## Call:
## glm(formula = diagnosis ~ radius_mean + texture_mean + perimeter_mean +
## area_mean + smoothness_mean + compactness_mean + concavity_mean +
## `concave points_mean` + symmetry_mean + fractal_dimension_mean +
## radius_se + texture_se + perimeter_se + area_se + smoothness_se +
## compactness_se + concavity_se + `concave points_se` + symmetry_se +
## fractal_dimension_se, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.317 0.000 0.000 0.000 2.136
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -360.60177 545.86789 -0.661 0.509
## radius_mean 123.35635 193.50517 0.637 0.524
## texture_mean 5.41398 7.08341 0.764 0.445
## perimeter_mean -20.56072 29.99195 -0.686 0.493
## area_mean 0.08655 0.18712 0.463 0.644
## smoothness_mean -2341.41264 3334.91457 -0.702 0.483
## compactness_mean 379.21208 569.93197 0.665 0.506
## concavity_mean 943.41790 1350.82135 0.698 0.485
## `concave points_mean` 1504.32982 2010.64280 0.748 0.454
## symmetry_mean 773.95029 1002.93720 0.772 0.440
## fractal_dimension_mean 5851.30425 8045.54675 0.727 0.467
## radius_se -76.50918 230.85247 -0.331 0.740
## texture_se -34.99463 45.23880 -0.774 0.439
## perimeter_se 14.70440 27.64997 0.532 0.595
## area_se 2.37526 3.70090 0.642 0.521
## smoothness_se -4663.05813 6377.73774 -0.731 0.465
## compactness_se -2567.06626 3467.70359 -0.740 0.459
## concavity_se -204.44834 667.66965 -0.306 0.759
## `concave points_se` 1708.06481 2562.64576 0.667 0.505
## symmetry_se -1087.68610 1385.67389 -0.785 0.432
## fractal_dimension_se -18817.19739 25354.48076 -0.742 0.458
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 20.026 on 279 degrees of freedom
## AIC: 62.026
##
## Number of Fisher Scoring iterations: 18
Comments: Trying to fit every variable in our logistic regression produced a convergence error caused by complete separation. To deal with this we should use a penalized model, because our model includes too many variables relative to the number of observations. (See [Convergence Error in Logistic Regression] and [Penalized Logistic Regression Essentials in R: Ridge, Lasso and Elastic Net] in References)
There are 3 different methods when it comes to Penalized Logistic Regression Models:
For this case, we will use a Lasso Regression Model being way more strict in attributing less to no weight to variables not significant enough.
# Required Packages
library(tidyverse)
library(caret)
library(glmnet)
# Setting Seed
set.seed(1)
# Define response variable
# NOTE(review): as.numeric() on a factor yields the codes 1/2 (not 0/1),
# and cv.glmnet() below is called without `family`, so it defaults to
# family = "gaussian" - i.e. this is a lasso *linear* regression used to
# screen variables for a binary outcome. A logistic lasso would pass
# family = "binomial" (and the factor response directly) - confirm intent.
y_lasso <- as.numeric(Training_Logistic$diagnosis)
# Define matrix of predictor variables
x_lasso <- data.matrix(Training_Logistic[,-c("diagnosis")])
# Perform k-fold cross-validation to find optimal lambda value - alpha = 1 is for using Lasso Method
cv_model <- cv.glmnet(x_lasso, y_lasso, alpha = 1)
# Find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
print(paste("Best Lambda is equal to",best_lambda))
## [1] "Best Lambda is equal to 0.000578270146672675"
# Produce plot of test MSE by lambda value
plot(cv_model)
Comments: We want to use the lowest MSE and thus find the optimal Lambda.
set.seed(1)
# Use optimal lambda value and alpha = 1 is for using Lasso Method
# (same gaussian-family lasso as the CV step above; the coefficients are
# used only to select variables for the glm refits that follow)
Logistic_Lasso_Optimal <- glmnet(x_lasso, y_lasso, alpha = 1, lambda = best_lambda)
# Disable Scientific Notation
options(scipen=999)
# Model Summary: sparse coefficient vector, "." marks variables shrunk to 0
Logistic_Lasso_Optimal$beta
## 30 x 1 sparse Matrix of class "dgCMatrix"
## s0
## radius_mean .
## texture_mean 0.0092460114
## perimeter_mean 0.0007441971
## area_mean .
## smoothness_mean -0.6726642078
## compactness_mean -3.0655109153
## concavity_mean 2.3768884384
## concave points_mean .
## symmetry_mean 0.6146556627
## fractal_dimension_mean -5.5409126922
## radius_se 0.3947352904
## texture_se 0.0125006149
## perimeter_se 0.0175743608
## area_se -0.0026599944
## smoothness_se 2.4042915125
## compactness_se -2.1473061382
## concavity_se -3.0067441349
## concave points_se 8.7579605809
## symmetry_se 6.9831540153
## fractal_dimension_se -7.8748193831
## radius_worst 0.0873539874
## texture_worst .
## perimeter_worst 0.0010958466
## area_worst -0.0004789178
## smoothness_worst 2.8169401453
## compactness_worst 0.2083513325
## concavity_worst 0.0294152220
## concave points_worst 1.3314761934
## symmetry_worst .
## fractal_dimension_worst 5.2951497825
Comments: We can see that our Logistic Model has shrunk some variables to 0, this can be expected when using Lasso Regression, since it will get rid of unsignificant variables completely instead of setting a very low coefficient.
set.seed(1)
# Fit The Logistic Regression Model with only selected variables from Lasso.
# The two warnings below: the IRLS algorithm hit its iteration limit, and
# fitted probabilities of exactly 0/1 occurred - see the multicollinearity
# discussion that follows.
Logistic_Model_After <- glm(diagnosis ~ area_mean + smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se + radius_worst + texture_worst + area_worst + concavity_worst + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After)
##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + compactness_mean +
## concavity_mean + `concave points_mean` + symmetry_mean +
## fractal_dimension_mean + radius_se + texture_se + perimeter_se +
## area_se + smoothness_se + concavity_se + `concave points_se` +
## symmetry_se + fractal_dimension_se + radius_worst + texture_worst +
## area_worst + concavity_worst + `concave points_worst` + symmetry_worst +
## fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.000091587 -0.000000021 -0.000000021 0.000000021 0.000095973
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1036.5304 2846587.3158 0.000 1.000
## area_mean -0.1696 1462.2833 0.000 1.000
## smoothness_mean 1450.2601 4192046.3648 0.000 1.000
## compactness_mean -1731.9712 1906426.5985 -0.001 0.999
## concavity_mean 1469.0815 3711329.7662 0.000 1.000
## `concave points_mean` 1070.0724 4578141.0180 0.000 1.000
## symmetry_mean -165.2926 2286527.5901 0.000 1.000
## fractal_dimension_mean -437.0810 14827275.0943 0.000 1.000
## radius_se -7.1344 886865.7546 0.000 1.000
## texture_se -26.0697 118646.6300 0.000 1.000
## perimeter_se 17.0820 55798.5434 0.000 1.000
## area_se 1.1568 9851.6304 0.000 1.000
## smoothness_se -10007.8391 23596748.0400 0.000 1.000
## concavity_se -2628.1095 7354098.2654 0.000 1.000
## `concave points_se` 25297.8713 16023563.8416 0.002 0.999
## symmetry_se -931.2103 8047236.1089 0.000 1.000
## fractal_dimension_se -54837.6628 30037108.6303 -0.002 0.999
## radius_worst 19.9328 175061.2117 0.000 1.000
## texture_worst 6.0563 5777.3810 0.001 0.999
## area_worst 0.1061 2460.1674 0.000 1.000
## concavity_worst 112.5699 552215.1283 0.000 1.000
## `concave points_worst` -1639.6759 2321009.5161 -0.001 0.999
## symmetry_worst 260.3450 802972.6305 0.000 1.000
## fractal_dimension_worst 6959.5789 5101143.4606 0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187196953955 on 299 degrees of freedom
## Residual deviance: 0.000000089786 on 276 degrees of freedom
## AIC: 48
##
## Number of Fisher Scoring iterations: 25
Comments: Even though we did a Lasso Regression, we can see that our standard Logistic Regression fails to converge with this selection of variables. This could be due to the number of variables — 23 in our case — relative to the low number of observations. Now it's time to check our VIF, since multicollinearity could be our main source of problems.
We need to compute multiple stage when removing our multicolinear Variables, let’s see when we don’t have anymore problem of multicolinearity.
set.seed(1)
# Load the car library
library(car)
# Variance inflation factors for every predictor in the model (iteration 1)
vif_values <- vif(Logistic_Model_After)
# Bar chart of the VIF values, one bar per predictor
barplot(vif_values,
        main = "VIF Values - First Iteration",
        horiz = FALSE,
        col = "steelblue",
        las = 2)
# Dashed horizontal reference line at the usual VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)
# Print the raw VIF values
vif_values
## area_mean smoothness_mean compactness_mean
## 12711.7372 474.1363 658.4854
## concavity_mean `concave points_mean` symmetry_mean
## 5543.8846 1826.2882 387.8132
## fractal_dimension_mean radius_se texture_se
## 836.5596 3823.2095 637.7963
## perimeter_se area_se smoothness_se
## 603.9070 9076.0715 736.7649
## concavity_se `concave points_se` symmetry_se
## 2048.6454 395.2401 291.1494
## fractal_dimension_se radius_worst texture_worst
## 236.8928 23366.1249 144.7868
## area_worst concavity_worst `concave points_worst`
## 52809.4217 611.4157 691.8691
## symmetry_worst fractal_dimension_worst
## 210.7082 519.5269
Comments: We can see that most of our variables exhibit multicollinearity (with VIF over 5). We need to remove the variables with the highest VIF first. We can start by removing 7 variables: radius_worst, area_worst, concavity_worst, concavity_mean, fractal_dimension_se, concavity_se and concave points_mean.
set.seed(1)
# Fit The Logistic Regression Model, refit after dropping the highest-VIF
# predictors from the previous model (the separation warning still appears)
Logistic_Model_After_VIF1 <- glm(diagnosis ~ area_mean + smoothness_mean + compactness_mean + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + `concave points_se` + symmetry_se + texture_worst + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF1)
##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + compactness_mean +
## symmetry_mean + fractal_dimension_mean + radius_se + texture_se +
## perimeter_se + area_se + smoothness_se + `concave points_se` +
## symmetry_se + texture_worst + `concave points_worst` + symmetry_worst +
## fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.66215 -0.02679 -0.00130 0.00007 2.74659
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -28.101214 21.470757 -1.309 0.1906
## area_mean 0.005558 0.012446 0.447 0.6552
## smoothness_mean 95.448428 87.774297 1.087 0.2768
## compactness_mean -11.799465 43.355986 -0.272 0.7855
## symmetry_mean -50.291312 44.623905 -1.127 0.2597
## fractal_dimension_mean -170.084912 282.240446 -0.603 0.5468
## radius_se 16.946697 41.812049 0.405 0.6853
## texture_se -0.751951 2.176595 -0.345 0.7297
## perimeter_se -0.005054 1.670765 -0.003 0.9976
## area_se 0.054588 0.459390 0.119 0.9054
## smoothness_se 145.423653 413.481380 0.352 0.7251
## `concave points_se` -111.604244 340.577154 -0.328 0.7431
## symmetry_se -86.243828 166.526520 -0.518 0.6045
## texture_worst 0.412596 0.222259 1.856 0.0634 .
## `concave points_worst` 96.771896 44.386229 2.180 0.0292 *
## symmetry_worst 31.814364 23.021997 1.382 0.1670
## fractal_dimension_worst -11.018661 114.043706 -0.097 0.9230
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 31.902 on 283 degrees of freedom
## AIC: 65.902
##
## Number of Fisher Scoring iterations: 11
Comments: Even after removing 7 variables, our model still suffers from a convergence error; let's check our VIF again.
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values (second iteration, after the first removals)
vif_values_2 <- vif(Logistic_Model_After_VIF1)
# Bar chart to display each VIF value
barplot(vif_values_2, main = "VIF Values - Second Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add horizontal reference line at the VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)
# Call VIF Values
vif_values_2
## area_mean smoothness_mean compactness_mean
## 15.595581 6.145892 10.148517
## symmetry_mean fractal_dimension_mean radius_se
## 5.027208 13.100655 153.870701
## texture_se perimeter_se area_se
## 5.769376 6.997751 151.905108
## smoothness_se `concave points_se` symmetry_se
## 4.069325 12.270014 8.053918
## texture_worst `concave points_worst` symmetry_worst
## 6.419859 7.861339 10.357040
## fractal_dimension_worst
## 11.174660
Comments: We can still see high VIF values in our variables, let’s remove 6 variables again: radius_se, perimeter_se, area_se, compactness_mean, texture_se and texture_worst.
set.seed(1)
# Fit The Logistic Regression Model: third model, with 6 more variables
# dropped after the second VIF check (radius_se, perimeter_se, area_se,
# compactness_mean, texture_se, texture_worst)
Logistic_Model_After_VIF2 <- glm(diagnosis ~ area_mean + smoothness_mean + symmetry_mean + fractal_dimension_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF2)
##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean +
## fractal_dimension_mean + smoothness_se + `concave points_se` +
## symmetry_se + `concave points_worst` + symmetry_worst + fractal_dimension_worst,
## family = binomial(link = "logit"), data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6820 -0.0722 -0.0158 0.0093 4.3901
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.896503 7.552690 -1.972 0.048570 *
## area_mean 0.012886 0.003772 3.416 0.000634 ***
## smoothness_mean 48.131843 55.833555 0.862 0.388655
## symmetry_mean -7.054654 30.331490 -0.233 0.816084
## fractal_dimension_mean -261.372561 186.237770 -1.403 0.160487
## smoothness_se 514.036777 280.214593 1.834 0.066589 .
## `concave points_se` -139.375305 216.104775 -0.645 0.518964
## symmetry_se 30.505299 117.585842 0.259 0.795303
## `concave points_worst` 59.599022 23.907022 2.493 0.012669 *
## symmetry_worst 12.023580 16.496592 0.729 0.466092
## fractal_dimension_worst 64.714978 62.539298 1.035 0.300767
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 51.151 on 289 degrees of freedom
## AIC: 73.151
##
## Number of Fisher Scoring iterations: 9
Comments: Now we can see that our Regression Model Converge, let’s compute a third iteration of VIF to check the multicolinearity.
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values (third iteration)
vif_values_3 <- vif(Logistic_Model_After_VIF2)
# Bar chart to display each VIF value
barplot(vif_values_3, main = "VIF Values - Third Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add horizontal reference line at the VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)
# Call VIF Values
vif_values_3
## area_mean smoothness_mean symmetry_mean
## 2.036432 3.309360 3.734544
## fractal_dimension_mean smoothness_se `concave points_se`
## 8.371764 2.751976 5.868334
## symmetry_se `concave points_worst` symmetry_worst
## 5.369938 3.926850 7.396930
## fractal_dimension_worst
## 6.413996
Comments: We have indeed improved our model's VIF values by excluding many multicollinear variables. We can still see several variables with a VIF higher than 5; let's remove fractal_dimension_mean and see if that improves things.
set.seed(1)
# Fit The Logistic Regression Model with only selected variables from Lasso
# (fractal_dimension_mean - the highest VIF in the previous iteration - removed)
Logistic_Model_After_VIF3 <- glm(diagnosis ~ area_mean + smoothness_mean + symmetry_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF3)
##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean +
## smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` +
## symmetry_worst + fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5703 -0.0904 -0.0212 0.0091 4.2589
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.569221 5.766024 -3.741 0.000183 ***
## area_mean 0.014251 0.003528 4.039 0.0000536 ***
## smoothness_mean 2.822523 42.528827 0.066 0.947085
## symmetry_mean -8.237752 28.629721 -0.288 0.773550
## smoothness_se 544.620641 260.927403 2.087 0.036866 *
## `concave points_se` -184.081058 189.735324 -0.970 0.331947
## symmetry_se -2.199196 104.630291 -0.021 0.983231
## `concave points_worst` 64.859936 22.683560 2.859 0.004245 **
## symmetry_worst 12.776311 14.687096 0.870 0.384355
## fractal_dimension_worst -2.007970 38.811877 -0.052 0.958739
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 53.299 on 290 degrees of freedom
## AIC: 73.299
##
## Number of Fisher Scoring iterations: 9
Comments: Now we can see that our Regression Model Converge, let’s compute a fourth iteration of VIF to check the multicolinearity.
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values (fourth iteration)
vif_values_4 <- vif(Logistic_Model_After_VIF3)
# Bar chart to display each VIF value
barplot(vif_values_4, main = "VIF Values - Fourth Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add horizontal reference line at the VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)
# Call VIF Values
vif_values_4
## area_mean smoothness_mean symmetry_mean
## 1.974465 2.170228 3.393934
## smoothness_se `concave points_se` symmetry_se
## 2.506280 5.034108 4.507909
## `concave points_worst` symmetry_worst fractal_dimension_worst
## 3.656527 6.429923 2.915146
Comments: We can see that our highest VIF values come from symmetry_worst, we can remove it and check if our model is now free of multicolinearity issues.
set.seed(1)
# Fit The Logistic Regression Model with only selected variables from Lasso
# (symmetry_worst - the highest VIF in the previous iteration - removed)
Logistic_Model_After_VIF4 <- glm(diagnosis ~ area_mean + smoothness_mean + symmetry_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF4)
##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean +
## smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` +
## fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6582 -0.0909 -0.0251 0.0086 4.2428
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.031832 5.780075 -3.639 0.000274 ***
## area_mean 0.014467 0.003604 4.014 0.0000597 ***
## smoothness_mean -6.948246 39.846117 -0.174 0.861569
## symmetry_mean 6.581404 23.217996 0.283 0.776823
## smoothness_se 483.403351 217.848490 2.219 0.026487 *
## `concave points_se` -273.861042 131.451779 -2.083 0.037219 *
## symmetry_se 61.479884 66.461799 0.925 0.354945
## `concave points_worst` 71.119350 21.538479 3.302 0.000960 ***
## fractal_dimension_worst 8.358018 35.734350 0.234 0.815068
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 54.198 on 291 degrees of freedom
## AIC: 72.198
##
## Number of Fisher Scoring iterations: 9
Comments: We improved some significance level removing this high VIF variable.
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values (fifth and final iteration)
vif_values_5<- vif(Logistic_Model_After_VIF4)
# Bar chart to display each VIF value
barplot(vif_values_5, main = "VIF Values - Fifth Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add horizontal reference line at the VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)
# Call VIF Values
vif_values_5
## area_mean smoothness_mean symmetry_mean
## 2.054457 1.987622 2.246336
## smoothness_se `concave points_se` symmetry_se
## 2.286389 2.614686 2.244157
## `concave points_worst` fractal_dimension_worst
## 3.857056 2.524969
Comments: Now we can see that all our selected variables are no longer subject to multicollinearity. Let’s use this model to compute some predictions.
set.seed(1)
# Predicted probability of malignancy on the validation set, restricted to the
# predictors kept after VIF elimination
Logistic_Lasso_Predictions <- predict(Logistic_Model_After_VIF4, Validation_Logistic[,c("area_mean", "smoothness_mean", "symmetry_mean", "smoothness_se", "concave points_se", "symmetry_se", "concave points_worst", "fractal_dimension_worst")], type = "response")
# Rounding Predictions - 0.5 Threshold
Logistic_Lasso_Predictions_Dummy <- round(Logistic_Lasso_Predictions)
# As Numeric
Logistic_Lasso_Predictions_Dummy <- as.numeric(Logistic_Lasso_Predictions_Dummy)
# Side-by-side check of probabilities and their rounded 0/1 labels
DF_Logistic_Lasso_Predictions <- cbind(Logistic_Lasso_Predictions, Logistic_Lasso_Predictions_Dummy)
# As Factor (caret::confusionMatrix expects factor inputs)
Logistic_Lasso_Predictions_Dummy <- as.factor(Logistic_Lasso_Predictions_Dummy)
# Confusion Matrix with "1" (malignant) as the positive class
Confusion_Matrix_Logistic_Lasso <- confusionMatrix(data = Logistic_Lasso_Predictions_Dummy, reference = Validation_Logistic$diagnosis, positive = "1")
# Confusion-matrix plot for the logistic model.
#
# The original function here was a line-for-line duplicate of the generic
# draw_confusion_matrix() defined at the top of this file, differing only in
# the hard-coded title, so it is replaced by a delegation.
# paste0('CONFUSION MATRIX', ' ', titleaddon) in the generic function
# reproduces the original title string exactly.
# NOTE(review): the visible portion of draw_confusion_matrix() matches this
# body byte-for-byte — confirm its tail (DETAILS panel) does too.
#
# @param cm A caret::confusionMatrix object for the logistic model.
draw_confusion_matrix_Logistic_Lasso <- function(cm) {
draw_confusion_matrix(cm, titleaddon = 'for Logistic Regression - Validation')
}
# Plot the Confusion Matrix
draw_confusion_matrix_Logistic_Lasso(Confusion_Matrix_Logistic_Lasso)
Comments:
set.seed(1)
# Load ROCR Package (already attached in the header; kept for chunk independence)
library(ROCR)
# ROC curve for the logistic predictions on the validation set:
# prediction() pairs probabilities with true labels, performance() computes
# the true-positive / false-positive trade-off
pr <- ROCR::prediction(Logistic_Lasso_Predictions, Validation_Logistic$diagnosis)
prf <- ROCR::performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf, main="ROC for Validation Set")
# Diagonal reference line = performance of random guessing
abline(a = 0, b = 1)
Comments:
# Keep the VIF-reduced logistic model as the best logistic candidate.
Best_Logistic_Confusion <- Confusion_Matrix_Logistic_Lasso
Best_Logistic_Predictions_Dummy <- factor(Logistic_Lasso_Predictions_Dummy)
Best_Logistic_Predictions_Probabilities <- Logistic_Lasso_Predictions
# FINAL PREDICTIONS DATAFRAME: predicted class next to predicted probability.
DF_Best_Logistic_Predictions <- data.frame(Best_Logistic_Predictions_Dummy, Best_Logistic_Predictions_Probabilities)
# FINAL CONFUSION DATAFRAME: Sensitivity, Specificity and Accuracy in one row.
DF_Best_Logistic_Confusion <- data.frame(c(Best_Logistic_Confusion$byClass[1:2], Best_Logistic_Confusion$overall[1]))
colnames(DF_Best_Logistic_Confusion) <- "Best Logistic Lasso Regression"
DF_Best_Logistic_Confusion <- t(DF_Best_Logistic_Confusion)
# Copies of the partitions for the tree-based models, with the id column
# (column 1) removed and coerced to plain data frames.
Training_M <- data.frame(Training[,-c(1)])
Validation_M <- data.frame(Validation[,-c(1)])
Test_M <- data.frame(Test[,-c(1)])
# Partition shares relative to the full data set, in percent.
Prop_Training <- 100 * nrow(Training_M) / nrow(ORIGINAL)
Prop_Validation <- 100 * nrow(Validation_M) / nrow(ORIGINAL)
Prop_Test <- 100 * nrow(Test_M) / nrow(ORIGINAL)
# Report the three proportions in a single message.
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")
## [1] "The Proportions are: 52.72 % In Training, 27.94 % In Validation, and 19.33 % In Test"
Since the outcome variable is a binary factor, we model a classification tree. We first grow a deep tree with all the features included, and then reduce the size of the deep tree through pruning.
set.seed(1)
options(scipen=999)
# Deep classification tree on all features: tiny minbucket and cp let the
# tree grow almost unrestricted; it is pruned afterwards.
tree_full <- rpart(diagnosis ~ .,
data = Training_M,
method = "class", # "class" because Y is a binary factor
minbucket = 1,
cp = 0.00001)
# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits =-6)
length(tree_full$frame$var[tree_full$frame$var == "<leaf>"]) # End nodes
## [1] 9
relevance<-as.data.frame(tree_full$variable.importance) #we get the ranking of the variables by importance
kable(relevance, row.names = T,col.names="Variable Importance")%>% kable_paper("hover", full_width = T) #built table
| Variable Importance | |
|---|---|
| perimeter_worst | 107.1041853 |
| radius_worst | 104.4460423 |
| area_worst | 103.3104820 |
| radius_mean | 96.4545551 |
| perimeter_mean | 94.4386002 |
| area_mean | 93.1159458 |
| concave.points_worst | 16.8085026 |
| compactness_worst | 7.2225217 |
| symmetry_worst | 7.2225217 |
| concave.points_mean | 6.9755085 |
| concavity_worst | 5.7780173 |
| texture_mean | 5.2906178 |
| concavity_mean | 5.1375986 |
| texture_worst | 3.9679634 |
| fractal_dimension_mean | 2.6453089 |
| fractal_dimension_worst | 2.6453089 |
| smoothness_mean | 1.9784946 |
| texture_se | 1.4970760 |
| compactness_mean | 1.0338243 |
| concave.points_se | 0.8040856 |
| smoothness_se | 0.7485380 |
| compactness_se | 0.5743468 |
# Complexity-parameter table: CP, splits, and cross-validated error per level
printcp(tree_full, digits = 6) # print complexity value
##
## Classification tree:
## rpart(formula = diagnosis ~ ., data = Training_M, method = "class",
## minbucket = 1, cp = 0.00001)
##
## Variables actually used in tree construction:
## [1] concave.points_mean concave.points_worst perimeter_worst
## [4] radius_mean radius_worst smoothness_mean
## [7] texture_mean
##
## Root node error: 109/300 = 0.363333
##
## n= 300
##
## CP nsplit rel error xerror xstd
## 1 0.83486239 0 1.00000000 1.000000 0.0764263
## 2 0.08256881 1 0.16513761 0.266055 0.0469566
## 3 0.01834862 2 0.08256881 0.174312 0.0387028
## 4 0.00917431 4 0.04587156 0.165138 0.0377375
## 5 0.00001000 8 0.00917431 0.192661 0.0405438
plotcp(tree_full, upper = "splits") # we plot the progression of complexity values
# Prune the tree at the CP with the lowest cross-validated error
min_xerr<- which.min(tree_full$cptable[,"xerror"]) # select minimum cross-validation error
cp_bp <- tree_full$cptable[min_xerr,"CP"] # find the corresponding CP value, to get the "best pruned " tree
pruned_tree<- prune(tree_full, cp = cp_bp) # re-compute the tree with the selected Cp
rpart.plot(pruned_tree, yesno = TRUE, digits =-3)
length(pruned_tree$frame$var[pruned_tree$frame$var == "<leaf>"]) # how many end nodes
## [1] 5
Although the fully grown tree is already quite small, we still pruned it.
# classification prediction over validation data
pruned_pred <- predict(pruned_tree, Validation_M, type = "class")
pruned_prob <- predict(pruned_tree, Validation_M, type = "prob") # class-membership probabilities (column "1" = malignant)
# confusion matrix and accuracy of classification tree, positive class = "1"
tree_cf<- confusionMatrix(pruned_pred, Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(tree_cf)
Sensitivity is lower than Specificity (the malignant diagnosis is the minority class). Accuracy is pretty high.
#ROC curve
# Column 1 of Validation_M is taken as the actual label here (presumably
# diagnosis, since id was removed — TODO confirm). data.frame() auto-generates
# the column name "Validation_M...1." that roc() references below.
ROC_df <- data.frame(Validation_M[,1], pruned_pred)
# Convert both factor columns to numeric 0/1 via character (direct as.numeric
# on a factor would return level indices)
ROC_df[,1]<- as.numeric(as.character(ROC_df[,1]))
ROC_df$pruned_pred<- as.numeric(as.character(ROC_df$pruned_pred))
roc_score <- roc(data= ROC_df , response=Validation_M...1., pruned_pred) #AUC score
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score ,main ="ROC curve")
As K nearest neighbour is based on distances, the predictors need to be standardized so that they have a mean of 0 and a variance equal to 1. This standardization is necessary because otherwise variables with bigger values have more influence on the distance that is being calculated.
# Partitions for KNN, with the id column dropped.
ORIGINAL.KNN.train <- dplyr::select(Training, -id)
ORIGINAL.KNN.valid <- dplyr::select(Validation, -id)
ORIGINAL.KNN.test <- dplyr::select(Test, -id)
# KNN is distance-based, so center and scale every predictor. The statistics
# are estimated on the training partition only and the identical transform is
# then applied to all three partitions.
norm.value <- preProcess(ORIGINAL.KNN.train, method = c("center", "scale"))
ORIGINAL.KNN.train <- predict(norm.value, ORIGINAL.KNN.train)
ORIGINAL.KNN.valid <- predict(norm.value, ORIGINAL.KNN.valid)
ORIGINAL.KNN.test <- predict(norm.value, ORIGINAL.KNN.test)
To get the best k one might iterate over several values of k and choose the one with the highest value for the metric being considered. In this case we looked at either accuracy or sensitivity.
set.seed(1)
# Grids holding the validation accuracy and sensitivity for k = 1..30.
accuracy.df <- data.frame(k = seq(1, 30, 1), accuracy = rep(0, 30))
# NOTE(review): the misspelled column name "sensitivtiy" is kept because the
# plotting code further down references it.
sensitivity.df <- data.frame(k = seq(1, 30, 1), sensitivtiy = rep(0, 30))
# iterating over different ks
for (i in seq_len(30)) {
# fit the k-nearest-neighbour model on the standardized training data
KNN1 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = i)
# predicted class labels on the validation partition
KNN1.pred.valid.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
# predicted probability of the positive class ("1")
KNN1.pred.valid.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
# Compute the confusion matrix ONCE per iteration and reuse it for both
# metrics (the original called confusionMatrix() twice with the same inputs)
cm.i <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
sensitivity.df[i, 2] <- cm.i$byClass[1]
accuracy.df[i, 2] <- cm.i$overall[1]
}
# Interactive tables (DT::datatable) of the per-k metric grids
datatable(accuracy.df)
datatable(sensitivity.df)
# Plot both metrics against k.
# ggplot2 >= 3.4 deprecates the `size` aesthetic for lines (the original
# emitted that deprecation warning); `linewidth` is the supported spelling.
ggplot(accuracy.df) +
aes(x = k, y = accuracy) +
geom_line(linewidth = 0.7, colour = "#112646") +
labs(x = "Number of k nearest neighbours",
y = "Accuracy", title = "Accuracy regarding k") +
theme_minimal()
# aes() must keep the misspelled column name "sensitivtiy"; only the
# displayed axis label is corrected.
ggplot(sensitivity.df) +
aes(x = k, y = sensitivtiy) +
geom_line(linewidth = 0.7, colour = "#112646") +
labs(x = "Number of k nearest neighbours",
y = "Sensitivity", title = "Sensitivity regarding k") +
theme_minimal()
From the output we can see that the best k is either a 6 or 1. For accuracy the best k is at 6 while for sensitivity it is at 1 with the second best being at 6. Down below the code for KNN with k=6 and k=1.
set.seed(1)
# Final KNN model with k = 6 (best accuracy in the sweep above)
KNN1 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = 6)
# predicted class labels on the validation partition
KNN1.pred.valid.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
# predicted probability of the positive class ("1")
KNN1.pred.valid.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
# Confusion matrix on the validation set, positive class = "1"
KNN1.conf.mat <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
draw_confusion_matrix(KNN1.conf.mat, titleaddon = "KNN with k=6")
When looking at the output we see that 6 nearest neighbors doesn’t deliver a bad model but it lacks sensitivity which is very important in the case of classifying cancer.
Here we see the model with 1 nearest neighbor.
set.seed(1)
# KNN with a single neighbour (best sensitivity in the sweep, but k = 1 is
# prone to overfitting, as discussed below)
KNN2 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = 1)
# predicted class labels on the validation partition
KNN2.pred.valid.resp <- predict(KNN2, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
# predicted probability of the positive class ("1")
KNN2.pred.valid.prob <- predict(KNN2, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
# Confusion matrix on the validation set, positive class = "1"
KNN2.conf.mat <- confusionMatrix(KNN2.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
draw_confusion_matrix(KNN2.conf.mat, titleaddon = 'KNN')
We see that this model with 1 nearest neighbor is overall a little better than the model with k equal to 6 and is better in sensitivity. But we will not continue with this model as it isn’t much better than the one with k equal to 6 and in KNN only one neighbor implies overfitting. As we do not want to run this risk we continue with k equal to 6.
Down below we print out again the best model.
# Keep the k = 6 model as the best KNN candidate.
Best_KNN_Confusion <- KNN1.conf.mat
Best_KNN_Predictions_Dummy <- factor(KNN1.pred.valid.resp)
Best_KNN_Predictions_Probabilities <- KNN1.pred.valid.prob
# FINAL PREDICTIONS DATAFRAME: predicted class next to predicted probability.
DF_Best_KNN_Predictions <- data.frame(Best_KNN_Predictions_Dummy, Best_KNN_Predictions_Probabilities)
# FINAL CONFUSION DATAFRAME: Sensitivity, Specificity and Accuracy in one row.
DF_Best_KNN_Confusion <- data.frame(c(Best_KNN_Confusion$byClass[1:2], Best_KNN_Confusion$overall[1]))
colnames(DF_Best_KNN_Confusion) <- "Best KNN"
DF_Best_KNN_Confusion <- t(DF_Best_KNN_Confusion)
set.seed(1)
# Neural-network copies of the partitions: plain data frames with the
# "id" column (column 1) removed.
Training_NN <- data.frame(Training)[,-1]
Validation_NN <- data.frame(Validation)[,-1]
Test_NN <- data.frame(Test)[,-1]
# Center and scale using statistics estimated on the training partition only,
# then apply the identical transformation to validation and test.
Norm_NN <- preProcess(Training_NN, method = c("center", "scale"))
Training_NN_Preprocess <- predict(Norm_NN, Training_NN)
Validation_NN_Preprocess <- predict(Norm_NN, Validation_NN)
Test_NN_Preprocess <- predict(Norm_NN, Test_NN)
# Confusion Best
# NOTE(review): Confusion_Matrix_Neural_2 and Predictions_NN2_* are produced
# by the neural-network chunk elsewhere in this file — confirm it runs before
# this chunk.
Best_Neural_Network_Confusion <- Confusion_Matrix_Neural_2
# Predictions Best
Best_Neural_Network_Predictions_Dummy <- Predictions_NN2_Dummy
Best_Neural_Network_Predictions_Dummy <- factor(Best_Neural_Network_Predictions_Dummy)
Best_Neural_Network_Predictions_Probabilities <- Predictions_NN2_Probabilities
# Best Predictions as Data frame
DF_Best_Neural_Network_Predictions <- data.frame(Best_Neural_Network_Predictions_Dummy, Best_Neural_Network_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame (Sensitivity, Specificity, Accuracy)
DF_Best_Neural_Confusion <- data.frame(c(Best_Neural_Network_Confusion$byClass[c(1,2)], Best_Neural_Network_Confusion$overall[1]))
colnames(DF_Best_Neural_Confusion) <- "Best Neural Network"
DF_Best_Neural_Confusion <- t(DF_Best_Neural_Confusion) # FINAL CONFUSION DATAFRAME
To run discriminant analysis the data needs to be centered and scaled, as otherwise larger values might have a bigger influence.
set.seed(1)
# Partitions for discriminant analysis, with the id column dropped.
ORIGINAL.DA.train <- dplyr::select(Training, -id)
ORIGINAL.DA.valid <- dplyr::select(Validation, -id)
ORIGINAL.DA.test <- dplyr::select(Test, -id)
# Standardize: estimate center/scale on the training data only, then apply
# the same transform to all three partitions.
norm.value <- preProcess(ORIGINAL.DA.train, method = c("center", "scale"))
ORIGINAL.DA.train <- predict(norm.value, ORIGINAL.DA.train)
ORIGINAL.DA.valid <- predict(norm.value, ORIGINAL.DA.valid)
ORIGINAL.DA.test <- predict(norm.value, ORIGINAL.DA.test)
First we run a linear discriminant analysis.
set.seed(1)
# Linear discriminant analysis on all standardized predictors
DA1 <- lda(diagnosis~., data = ORIGINAL.DA.train)
# Make predictions on the validation partition
predictions <- predict(DA1, ORIGINAL.DA.valid)
# posterior probability of the positive class ("1")
DA1.pred.valid.prob <- predictions$posterior[,2]
# predicted class labels
DA1.pred.valid.resp <- factor(predictions$class)
# confusion matrix, positive class = "1"
DA1.conf.mat <- confusionMatrix(DA1.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA1.conf.mat, titleaddon = 'Discriminant Analysis')
# Evaluating LDA
# not run because it plots a lot of graphs
# partimat(diagnosis~., data = data.frame(ORIGINAL.DA.train), method="lda", mar=c(0.5, 0.5, 0.5, 0.5))
Not a bad model but again it lacks sensitivity. As the model is quite big we can try a variable selection.
set.seed(1)
# Backward stepwise variable selection for LDA, scored by the 10-fold
# cross-validated correctness rate (klaR::stepclass)
modelstepL <- stepclass(diagnosis ~ ., "lda", direction = "backward", data = data.frame(ORIGINAL.DA.train))
## `stepwise classification', using 10-fold cross-validated correctness rate of method lda'.
## 300 observations of 30 variables in 2 classes; direction: backward
## stop criterion: improvement less than 5%.
## correctness rate: 0.94667; starting variables (30): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.96333; out: "radius_worst"; variables (29): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
##
## hr.elapsed min.elapsed sec.elapsed
## 0.00 0.00 3.22
# LDA refit on the stepclass selection above: only "radius_worst" was
# dropped from the full predictor set
DA1.sel <- lda(diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean +
smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` +
symmetry_mean + fractal_dimension_mean + radius_se + texture_se +
perimeter_se + area_se + smoothness_se + compactness_se +
concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se +
texture_worst + perimeter_worst + area_worst + smoothness_worst +
compactness_worst + concavity_worst + `concave points_worst` +
symmetry_worst + fractal_dimension_worst, data = ORIGINAL.DA.train)
# Make predictions on the validation partition
predictions.sel <- predict(DA1.sel, ORIGINAL.DA.valid)
# posterior probability of the positive class ("1")
DA1.sel.pred.valid.prob <- predictions.sel$posterior[,2]
# predicted class labels
DA1.sel.pred.valid.resp <- factor(predictions.sel$class)
# confusion matrix, positive class = "1"
DA1.sel.conf.mat <- confusionMatrix(DA1.sel.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA1.sel.conf.mat, titleaddon = 'Discriminant Analysis')
We see that the model only drops one variable and the predictive power of the model doesn’t change much as instead of 1 false positive and 7 false negatives there are now 0 false positives and 8 false negatives.
As we have seen in the data analysis there is some correlation in the data, which is why we can try to run a quadratic discriminant analysis.
set.seed(1)
# Quadratic discriminant analysis: allows class-specific covariance matrices
DA2 <- qda(diagnosis ~., data = ORIGINAL.DA.train)
# Make predictions on the validation partition
predictions <- predict(DA2, ORIGINAL.DA.valid)
# posterior probability of the positive class ("1")
DA2.pred.valid.prob <- predictions$posterior[,2]
# predicted class labels
DA2.pred.valid.resp <- factor(predictions$class)
# confusion matrix, positive class = "1"
DA2.conf.mat <- confusionMatrix(DA2.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA2.conf.mat, titleaddon = 'Quadratic Discriminant Analysis')
Overall this model is worse than the LDA, but in terms of sensitivity it is better, as there are only 4 false negatives.
set.seed(1)
# Backward stepwise variable selection for QDA, scored by the 10-fold
# cross-validated correctness rate (klaR::stepclass)
modelstepL <- stepclass(diagnosis ~ ., "qda", direction = "backward", data = data.frame(ORIGINAL.DA.train))
## `stepwise classification', using 10-fold cross-validated correctness rate of method qda'.
## 300 observations of 30 variables in 2 classes; direction: backward
## stop criterion: improvement less than 5%.
## correctness rate: 0.94667; starting variables (30): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.95333; out: "area_mean"; variables (29): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.96333; out: "concavity_worst"; variables (28): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.96667; out: "perimeter_worst"; variables (27): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.97; out: "perimeter_mean"; variables (26): radius_mean, texture_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.97667; out: "smoothness_mean"; variables (25): radius_mean, texture_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.98; out: "smoothness_se"; variables (24): radius_mean, texture_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
##
## hr.elapsed min.elapsed sec.elapsed
## 0.00 0.00 7.75
# QDA refit on the stepclass selection above (six predictors dropped)
DA2.sel <- qda(diagnosis ~ radius_mean + texture_mean + compactness_mean + concavity_mean +
`concave points_mean` + symmetry_mean + fractal_dimension_mean +
radius_se + texture_se + perimeter_se + area_se + compactness_se +
concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se +
radius_worst + texture_worst + area_worst + smoothness_worst +
compactness_worst + `concave points_worst` + symmetry_worst +
fractal_dimension_worst, data = ORIGINAL.DA.train)
# Make predictions on the validation partition
predictions <- predict(DA2.sel, ORIGINAL.DA.valid)
# posterior probability of the positive class ("1")
DA2.sel.pred.valid.prob <- predictions$posterior[,2]
# predicted class labels
DA2.sel.pred.valid.resp <- factor(predictions$class)
# confusion matrix, positive class = "1"
DA2.sel.conf.mat <- confusionMatrix(DA2.sel.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA2.sel.conf.mat, titleaddon = 'Quadratic Discriminant Analysis')
This time around 6 variables were dropped but we see that the model is not better than the qda with all predictors.
There are several more discriminant analysis methods that can be applied. Down below we tried Mixture discriminant analysis (MDA), which often outperforms QDA and LDA because the assumptions about the class distributions are looser than for LDA and QDA.
set.seed(1)
# Mixture discriminant analysis (mda): looser distributional assumptions
# than LDA/QDA.
DA4 <- mda(diagnosis~., data = ORIGINAL.DA.train)
# posterior probability of the positive class ("1") on the validation set
DA4.pred.valid.prob <- predict(DA4, ORIGINAL.DA.valid, type = "posterior")[,2]
# classify with a 0.5 threshold
DA4.pred.valid.resp <- factor(ifelse(DA4.pred.valid.prob > 0.5, 1, 0))
# confusion matrix, positive class = "1"
DA4.conf.mat <- confusionMatrix(DA4.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
# Pass titleaddon by name for consistency with every other
# draw_confusion_matrix() call in this file (it was positional here).
draw_confusion_matrix(DA4.conf.mat, titleaddon = 'Mixture discriminant analysis')
We see that this method in general works better than the lda and qda. In this case we don’t proceed with this model as it isn’t better in sensitivity which is what we want when predicting cancer.
Down below we run a flexible discriminant analysis (fda) which is an extension of lda using non-linear combinations of predictors (splines)
set.seed(1)
# Flexible discriminant analysis (mda::fda): extends LDA with non-linear
# combinations of the predictors
DA5 <- fda(diagnosis~., data = ORIGINAL.DA.train)
# posterior probability of the positive class ("1") on the validation set
DA5.pred.valid.prob <- predict(DA5, ORIGINAL.DA.valid, type = "posterior")[,2]
# classify with a 0.5 threshold
DA5.pred.valid.resp <- factor(ifelse(DA5.pred.valid.prob > 0.5, 1, 0))
# confusion matrix, positive class = "1"
DA5.conf.mat <- confusionMatrix(DA5.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA5.conf.mat, titleaddon = 'Flexible discriminant analysis')
We see that the model has a good accuracy but isn’t good in sensitivity. Therefore we don’t use it further.
Lastly we fit a regularized discriminant analysis (RDA) which is a trade off between qda and lda.
set.seed(1)
# Regularized discriminant analysis (klaR::rda): a compromise between LDA
# and QDA.
# NOTE(review): this reuses the name DA5 and overwrites the fda model fitted
# in the previous chunk — rename to DA6 if the fda fit is needed later.
DA5 <- rda(diagnosis~., data = data.frame(ORIGINAL.DA.train))
# posterior probability of the positive class ("1") on the validation set
DA5.pred.valid.prob <- predict(DA5, data.frame(ORIGINAL.DA.valid))$posterior[,2]
# classify with a 0.5 threshold
DA5.pred.valid.resp <- factor(ifelse(DA5.pred.valid.prob > 0.5, 1, 0))
# confusion matrix, positive class = "1"
DA5.conf.mat <- confusionMatrix(DA5.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA5.conf.mat, titleaddon = 'Regularized discriminant analysis')
This model isn’t better than any other model therefore we don’t use it further.
As the highest sensitivity was achieved by QDA with acceptable accuracy, we chose this model as the best one.
# QDA (DA2) gave the best sensitivity, so keep it as the best DA model.
Best_DA_Confusion <- DA2.conf.mat
Best_DA_Predictions_Dummy <- factor(DA2.pred.valid.resp)
Best_DA_Predictions_Probabilities <- DA2.pred.valid.prob
# FINAL PREDICTIONS DATAFRAME: predicted class next to predicted probability.
DF_Best_DA_Predictions <- data.frame(Best_DA_Predictions_Dummy, Best_DA_Predictions_Probabilities)
# FINAL CONFUSION DATAFRAME: Sensitivity, Specificity and Accuracy in one row.
DF_Best_DA_Confusion <- data.frame(c(Best_DA_Confusion$byClass[1:2], Best_DA_Confusion$overall[1]))
colnames(DF_Best_DA_Confusion) <- "Best DA"
DF_Best_DA_Confusion <- t(DF_Best_DA_Confusion)
set.seed(1)
# Bagged classification trees (adabag).
# NOTE(review): this assignment masks adabag::bagging with the fitted model;
# any later call would need adabag::bagging explicitly.
bagging<- bagging(diagnosis ~ ., data =Training_M)
# predicted classes on the validation partition
bag_pred<- predict(bagging, Validation_M, type="class")
# confusion matrix, positive class = "1"
bag_cf <- confusionMatrix(as.factor(bag_pred$class), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(bag_cf)
set.seed(1)
# Boosted classification trees (adabag).
# NOTE(review): as with bagging above, the model object masks
# adabag::boosting.
boosting <- boosting(diagnosis ~ ., data = Training_M)
# predicted classes on the validation partition
boost_pred<- predict(boosting, Validation_M, type="class")
# confusion matrix, positive class = "1"
boost_cf <- confusionMatrix(as.factor(boost_pred$class), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(boost_cf)
set.seed(1)
# Random forest, 4 candidate variables per split; importance = T enables the
# permutation importance displayed by varImpPlot below
rand_f <- randomForest(diagnosis ~ ., data = Training_M, mtry=4, importance = T)
varImpPlot(rand_f, type=1,cex = 0.7) # we print out the variable importance plot too
# predicted classes on the validation partition
rf_pred<- predict(rand_f, Validation_M, type="class")
# confusion matrix, positive class = "1"
rf_cf <- confusionMatrix(as.factor(rf_pred), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(rf_cf)
# Build a pROC ROC object from actual labels and predicted class labels.
# Both arrive as factors with levels "0"/"1", so convert via character to
# numeric 0/1 (direct as.numeric on a factor would return level indices).
# This replaces three copies of identical data-frame boilerplate in the
# original (one per ensemble model).
roc_from_factors <- function(actual, predicted) {
roc(as.numeric(as.character(actual)), as.numeric(as.character(predicted)))
}
# AUC scores for the three ensemble models on the validation set.
# Column 1 of Validation_M is the diagnosis (id was removed earlier).
roc_score_boost <- roc_from_factors(Validation_M[,1], boost_pred$class)
roc_score_bag <- roc_from_factors(Validation_M[,1], bag_pred$class)
roc_score_rf <- roc_from_factors(Validation_M[,1], rf_pred)
# Plot ROC curves side by side
par(mfrow=c(1,3))
plot(roc_score_boost ,main ="ROC curve for Boosting", cex= 1)
plot(roc_score_bag ,main ="ROC curve for Bagging", cex=1)
plot(roc_score_rf ,main ="ROC curve for Random Forests", cex=1)
# The boosted tree is chosen as the best tree-based model.
best_boosted_confusion <- boost_cf
best_boosted_pred <- boost_pred
# Class-membership probabilities on the validation partition ("boosting" is
# the fitted adabag model object).
best_boosted_prob<- predict(boosting, Validation_M, type="prob")
# FINAL PREDICTIONS DATAFRAME: predicted class next to P(class = "1").
DF_best_boosted_pred <- data.frame(best_boosted_pred$class,best_boosted_prob$prob[,2])
# FINAL CONFUSION DATAFRAME: Sensitivity, Specificity and Accuracy in one row.
DF_best_boosted_confusion <- data.frame(c(best_boosted_confusion$byClass[1:2], best_boosted_confusion$overall[1]))
colnames(DF_best_boosted_confusion) <- "Boosted Tree Model"
DF_best_boosted_confusion<- t(DF_best_boosted_confusion)
## Best_Logistic_Predictions_Dummy Best_Logistic_Predictions_Probabilities
## 1 1 0.999998884279
## 2 1 0.764995713849
## 3 1 0.968538861355
## 4 1 0.931515195197
## 5 1 0.990662066546
## 6 1 0.875723102074
## 7 0 0.309018569943
## 8 1 0.999321090298
## 9 0 0.371587814346
## 10 1 0.999067567458
## 11 1 0.675002354640
## 12 0 0.000077823130
## 13 0 0.079744986206
## 14 1 0.999998815292
## 15 0 0.447995457713
## 16 0 0.447956038786
## 17 1 0.999996624809
## 18 0 0.001096642226
## 19 0 0.061921993277
## 20 1 0.914933882168
## 21 0 0.000487539786
## 22 1 0.723279433991
## 23 0 0.002426941623
## 24 1 0.999999996385
## 25 1 0.695308338264
## 26 0 0.003461771367
## 27 0 0.385131694427
## 28 0 0.005757588165
## 29 1 0.996757181863
## 30 1 0.999982530644
## 31 1 0.832397618429
## 32 0 0.105144180129
## 33 0 0.000073595543
## 34 0 0.009073336233
## 35 0 0.022126770357
## 36 0 0.000050824722
## 37 1 0.994816310902
## 38 0 0.008170151323
## 39 0 0.228811168809
## 40 1 0.999973242032
## 41 0 0.009449468658
## 42 1 0.723819821987
## 43 0 0.022136113515
## 44 0 0.000158845010
## 45 0 0.000796344731
## 46 0 0.000623730356
## 47 0 0.000013039320
## 48 1 0.862758616151
## 49 0 0.000052000219
## 50 0 0.045376732585
## 51 0 0.003793978558
## 52 0 0.003023703028
## 53 0 0.140343168303
## 54 1 0.994900095799
## 55 0 0.020250130231
## 56 1 0.997278972177
## 57 0 0.007964105607
## 58 0 0.001746447355
## 59 1 0.997661830162
## 60 0 0.000064475980
## 61 0 0.000056528523
## 62 1 0.998427484755
## 63 0 0.006931465439
## 64 0 0.000578360576
## 65 1 0.992294603988
## 66 0 0.006197709274
## 67 1 0.975214547873
## 68 1 0.999999999921
## 69 1 0.691051595907
## 70 0 0.000473138075
## 71 0 0.170343180410
## 72 1 1.000000000000
## 73 0 0.014410223017
## 74 1 0.951609407809
## 75 0 0.176230468310
## 76 1 0.999970030499
## 77 0 0.017365719016
## 78 1 0.999999999788
## 79 1 0.999900452658
## 80 0 0.000180884137
## 81 1 0.999999171928
## 82 1 0.998460014879
## 83 1 0.999992661214
## 84 0 0.087245511902
## 85 1 0.999615924199
## 86 0 0.001936983381
## 87 0 0.003252824380
## 88 0 0.018134138230
## 89 0 0.000975576735
## 90 0 0.001084531369
## 91 0 0.002616531588
## 92 1 0.999986405229
## 93 0 0.000190604286
## 94 0 0.000189016498
## 95 0 0.003412507401
## 96 0 0.000073217293
## 97 1 0.977909201124
## 98 0 0.000382518300
## 99 0 0.010933841771
## 100 0 0.004309603619
## 101 0 0.000220894978
## 102 1 1.000000000000
## 103 0 0.004122332728
## 104 0 0.007017374293
## 105 1 0.999999812265
## 106 0 0.054930762660
## 107 0 0.000473696893
## 108 0 0.001822900621
## 109 0 0.001913543870
## 110 1 0.999999988169
## 111 0 0.137803855881
## 112 1 0.999969986215
## 113 0 0.001615127586
## 114 0 0.000533952997
## 115 0 0.000007471704
## 116 0 0.453693547937
## 117 0 0.115060254023
## 118 0 0.006452918089
## 119 0 0.473136989065
## 120 0 0.003164996296
## 121 1 0.999847047350
## 122 0 0.004296573993
## 123 0 0.000906813205
## 124 0 0.000057345126
## 125 0 0.005165357314
## 126 0 0.005150291371
## 127 0 0.001804794845
## 128 0 0.062674062724
## 129 0 0.336494350429
## 130 0 0.152993823354
## 131 1 0.992930485924
## 132 0 0.000517600421
## 133 1 0.875947037782
## 134 0 0.006788665821
## 135 0 0.008687277132
## 136 0 0.057282662110
## 137 1 0.862352744392
## 138 0 0.021399111264
## 139 1 0.999999991490
## 140 1 0.884513809822
## 141 1 0.998506675406
## 142 0 0.049946876869
## 143 0 0.194272366445
## 144 1 0.999211716151
## 145 1 0.999890029439
## 146 0 0.009781052378
## 147 1 0.999999999954
## 148 0 0.009735025868
## 149 0 0.002310519659
## 150 0 0.000237651172
## 151 0 0.085351246061
## 152 0 0.000021794342
## 153 0 0.000268973544
## 154 0 0.000126063015
## 155 0 0.007535119744
## 156 0 0.028136026770
## 157 1 0.999999744789
## 158 1 0.999896670077
## 159 1 0.999999983750
## Sensitivity Specificity Accuracy
## Best Logistic Lasso Regression 0.8181818 0.9569892 0.8993711
## Best_KNN_Predictions_Dummy Best_KNN_Predictions_Probabilities
## 1 1 1.0000000
## 2 1 0.8333333
## 3 1 1.0000000
## 4 1 1.0000000
## 5 1 1.0000000
## 6 1 0.6666667
## 7 0 0.0000000
## 8 1 1.0000000
## 9 1 0.8333333
## 10 1 1.0000000
## 11 1 0.8333333
## 12 1 0.5000000
## 13 0 0.3333333
## 14 1 1.0000000
## 15 1 0.6666667
## 16 0 0.5000000
## 17 1 1.0000000
## 18 0 0.0000000
## 19 0 0.0000000
## 20 1 1.0000000
## 21 0 0.0000000
## 22 1 1.0000000
## 23 0 0.0000000
## 24 1 1.0000000
## 25 1 0.8333333
## 26 0 0.0000000
## 27 1 0.8333333
## 28 0 0.0000000
## 29 1 1.0000000
## 30 1 1.0000000
## 31 0 0.1666667
## 32 0 0.5000000
## 33 0 0.0000000
## 34 0 0.0000000
## 35 0 0.0000000
## 36 0 0.0000000
## 37 1 1.0000000
## 38 0 0.0000000
## 39 0 0.3333333
## 40 1 1.0000000
## 41 0 0.0000000
## 42 0 0.0000000
## 43 0 0.0000000
## 44 0 0.1666667
## 45 0 0.0000000
## 46 0 0.0000000
## 47 0 0.0000000
## 48 1 1.0000000
## 49 0 0.0000000
## 50 0 0.0000000
## 51 0 0.0000000
## 52 0 0.0000000
## 53 0 0.0000000
## 54 1 1.0000000
## 55 0 0.0000000
## 56 1 1.0000000
## 57 0 0.0000000
## 58 0 0.0000000
## 59 1 1.0000000
## 60 0 0.0000000
## 61 0 0.0000000
## 62 1 0.8333333
## 63 0 0.0000000
## 64 0 0.0000000
## 65 1 1.0000000
## 66 0 0.0000000
## 67 1 0.8333333
## 68 1 1.0000000
## 69 0 0.1666667
## 70 0 0.0000000
## 71 0 0.0000000
## 72 1 1.0000000
## 73 0 0.0000000
## 74 1 1.0000000
## 75 0 0.0000000
## 76 1 1.0000000
## 77 0 0.0000000
## 78 1 1.0000000
## 79 1 1.0000000
## 80 0 0.0000000
## 81 1 1.0000000
## 82 1 0.8333333
## 83 1 1.0000000
## 84 0 0.1666667
## 85 1 1.0000000
## 86 0 0.0000000
## 87 0 0.0000000
## 88 0 0.0000000
## 89 0 0.0000000
## 90 0 0.0000000
## 91 0 0.0000000
## 92 1 1.0000000
## 93 0 0.0000000
## 94 0 0.0000000
## 95 0 0.0000000
## 96 0 0.0000000
## 97 1 0.6666667
## 98 0 0.0000000
## 99 0 0.0000000
## 100 0 0.0000000
## 101 0 0.0000000
## 102 1 1.0000000
## 103 0 0.0000000
## 104 0 0.0000000
## 105 1 1.0000000
## 106 0 0.0000000
## 107 0 0.0000000
## 108 0 0.0000000
## 109 0 0.0000000
## 110 1 1.0000000
## 111 0 0.1666667
## 112 1 1.0000000
## 113 0 0.0000000
## 114 0 0.0000000
## 115 0 0.0000000
## 116 0 0.3333333
## 117 0 0.0000000
## 118 0 0.0000000
## 119 1 0.5000000
## 120 0 0.0000000
## 121 1 1.0000000
## 122 0 0.0000000
## 123 0 0.0000000
## 124 0 0.0000000
## 125 0 0.0000000
## 126 0 0.0000000
## 127 0 0.0000000
## 128 0 0.0000000
## 129 0 0.1666667
## 130 0 0.0000000
## 131 1 0.6666667
## 132 0 0.0000000
## 133 0 0.1666667
## 134 0 0.1666667
## 135 0 0.0000000
## 136 0 0.1666667
## 137 1 0.8333333
## 138 0 0.0000000
## 139 1 1.0000000
## 140 0 0.0000000
## 141 1 1.0000000
## 142 0 0.0000000
## 143 0 0.1666667
## 144 1 1.0000000
## 145 1 1.0000000
## 146 0 0.0000000
## 147 1 1.0000000
## 148 0 0.0000000
## 149 0 0.1666667
## 150 0 0.0000000
## 151 0 0.0000000
## 152 0 0.0000000
## 153 0 0.0000000
## 154 0 0.0000000
## 155 0 0.0000000
## 156 0 0.1666667
## 157 1 1.0000000
## 158 1 1.0000000
## 159 1 1.0000000
## Sensitivity Specificity Accuracy
## Best KNN 0.8484848 0.9784946 0.9245283
## best_boosted_pred.class best_boosted_prob.prob...2.
## 1 1 1.000000000
## 2 1 0.774589153
## 3 1 0.900655350
## 4 1 0.840310897
## 5 1 0.840552805
## 6 1 0.975346710
## 7 0 0.173319532
## 8 1 0.770050514
## 9 1 0.813777944
## 10 1 1.000000000
## 11 1 0.666069277
## 12 0 0.485140647
## 13 0 0.472907382
## 14 1 0.867183905
## 15 1 0.827485860
## 16 1 0.551442850
## 17 1 0.943153946
## 18 0 0.104094062
## 19 0 0.149309845
## 20 1 0.896966945
## 21 0 0.056507886
## 22 1 0.953335671
## 23 0 0.027904221
## 24 1 0.888562394
## 25 1 0.570923920
## 26 0 0.086972061
## 27 0 0.350104997
## 28 0 0.176722195
## 29 1 0.968597971
## 30 1 0.957678122
## 31 1 0.650022797
## 32 1 0.510345761
## 33 0 0.022008290
## 34 0 0.066225986
## 35 0 0.217549506
## 36 0 0.125683900
## 37 1 0.926877683
## 38 0 0.082787917
## 39 0 0.476420033
## 40 1 0.990015244
## 41 0 0.094234128
## 42 0 0.331647435
## 43 0 0.119130313
## 44 0 0.334636340
## 45 0 0.074264895
## 46 0 0.044511696
## 47 0 0.095300075
## 48 1 0.925137213
## 49 0 0.018748451
## 50 0 0.152561423
## 51 0 0.087173859
## 52 0 0.083312358
## 53 0 0.122188715
## 54 1 0.931222911
## 55 0 0.034332890
## 56 1 0.929496528
## 57 0 0.115052194
## 58 0 0.089774503
## 59 1 0.969821814
## 60 0 0.071616017
## 61 0 0.122274949
## 62 1 0.859281159
## 63 0 0.089287961
## 64 0 0.017856928
## 65 1 0.744014206
## 66 0 0.000000000
## 67 1 0.823415386
## 68 1 0.897342351
## 69 1 0.556676801
## 70 0 0.028042743
## 71 0 0.190829375
## 72 1 0.757541299
## 73 0 0.051956002
## 74 1 0.975981357
## 75 0 0.263328913
## 76 1 0.937404465
## 77 0 0.073495304
## 78 1 0.990015244
## 79 1 0.918629913
## 80 0 0.021196511
## 81 1 0.981248482
## 82 1 0.971595461
## 83 1 0.990015244
## 84 0 0.244175154
## 85 1 0.970748769
## 86 0 0.027356306
## 87 0 0.072567968
## 88 0 0.062795569
## 89 0 0.104884677
## 90 0 0.030938610
## 91 0 0.072409791
## 92 1 0.991757675
## 93 0 0.039018248
## 94 0 0.137619467
## 95 0 0.031034093
## 96 0 0.030434358
## 97 1 0.641961337
## 98 0 0.036588813
## 99 0 0.039552469
## 100 0 0.074014837
## 101 0 0.017578831
## 102 1 0.974192288
## 103 0 0.051715684
## 104 0 0.057666986
## 105 1 0.964114863
## 106 0 0.056924331
## 107 0 0.042011303
## 108 0 0.073407002
## 109 0 0.000000000
## 110 1 1.000000000
## 111 0 0.495743385
## 112 1 0.980094569
## 113 0 0.009706093
## 114 0 0.073757104
## 115 0 0.022309543
## 116 0 0.461477113
## 117 0 0.354867644
## 118 0 0.022597626
## 119 0 0.334561463
## 120 0 0.087071908
## 121 1 0.958423702
## 122 0 0.067040824
## 123 0 0.177045406
## 124 0 0.079493545
## 125 0 0.168245844
## 126 0 0.109404005
## 127 0 0.130552835
## 128 0 0.195253417
## 129 0 0.360113570
## 130 0 0.250925079
## 131 1 0.810531285
## 132 0 0.074490889
## 133 1 0.593280237
## 134 0 0.140916383
## 135 0 0.046842791
## 136 0 0.287045790
## 137 1 0.670715994
## 138 0 0.070675559
## 139 1 0.936132593
## 140 0 0.213574145
## 141 1 0.892803380
## 142 0 0.255380052
## 143 0 0.453903337
## 144 1 1.000000000
## 145 1 1.000000000
## 146 0 0.093101229
## 147 1 1.000000000
## 148 0 0.140118064
## 149 0 0.271646140
## 150 0 0.067638368
## 151 0 0.323901690
## 152 0 0.027631463
## 153 0 0.059894681
## 154 0 0.009237847
## 155 0 0.101730559
## 156 0 0.112283477
## 157 1 0.959194619
## 158 1 0.911177622
## 159 1 0.938837924
## Sensitivity Specificity Accuracy
## Boosted Tree Model 0.8787879 0.9784946 0.9371069
## Best_DA_Predictions_Dummy
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 0
## 8 1
## 9 1
## 10 1
## 11 1
## 12 1
## 13 1
## 14 1
## 15 1
## 16 1
## 17 1
## 18 0
## 19 0
## 20 1
## 21 0
## 22 1
## 23 0
## 24 1
## 25 1
## 26 0
## 27 0
## 28 0
## 29 1
## 30 1
## 31 0
## 32 1
## 33 0
## 34 0
## 35 0
## 36 0
## 37 1
## 38 0
## 39 1
## 40 1
## 41 0
## 42 0
## 43 0
## 44 0
## 45 0
## 46 0
## 47 0
## 48 1
## 49 0
## 50 0
## 51 0
## 52 0
## 53 0
## 54 1
## 55 0
## 56 1
## 57 0
## 58 0
## 59 1
## 60 0
## 61 0
## 62 1
## 63 0
## 64 0
## 65 1
## 66 0
## 67 1
## 68 1
## 69 1
## 70 0
## 71 0
## 72 1
## 73 0
## 74 1
## 75 1
## 76 1
## 77 0
## 78 1
## 79 1
## 80 0
## 81 1
## 82 1
## 83 1
## 84 0
## 85 1
## 86 0
## 87 0
## 88 0
## 89 0
## 90 0
## 91 0
## 92 1
## 93 0
## 94 0
## 95 0
## 96 0
## 97 1
## 98 0
## 99 0
## 100 0
## 101 0
## 102 1
## 103 0
## 104 0
## 105 1
## 106 0
## 107 0
## 108 0
## 109 0
## 110 1
## 111 0
## 112 1
## 113 0
## 114 0
## 115 0
## 116 1
## 117 1
## 118 0
## 119 1
## 120 0
## 121 1
## 122 0
## 123 0
## 124 0
## 125 0
## 126 0
## 127 0
## 128 0
## 129 0
## 130 0
## 131 1
## 132 0
## 133 1
## 134 0
## 135 0
## 136 0
## 137 1
## 138 0
## 139 1
## 140 1
## 141 1
## 142 0
## 143 1
## 144 1
## 145 1
## 146 0
## 147 1
## 148 0
## 149 0
## 150 0
## 151 0
## 152 0
## 153 0
## 154 0
## 155 0
## 156 0
## 157 1
## 158 1
## 159 1
## Best_DA_Predictions_Probabilities
## 1 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 2 0.9999999999999975575093458246556110680103302001953125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 3 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 4 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 5 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 6 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 7 0.0000710520812996915984748394490289058467169525101780891418457031250000000000000000000000000000000000000000000000000000000000000000
## 8 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 9 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 10 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 11 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 12 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 13 0.9997780399279516672095269314013421535491943359375000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 14 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 15 0.9999999999999980015985556747182272374629974365234375000000000000000000000000000000000000000000000000000000000000000000000000000000
## 16 0.9999999999999962252417162744677625596523284912109375000000000000000000000000000000000000000000000000000000000000000000000000000000
## 17 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 18 0.0000027803801730188075838623440461105928989127278327941894531250000000000000000000000000000000000000000000000000000000000000000000
## 19 0.0010635516868209913925180654103996857884339988231658935546875000000000000000000000000000000000000000000000000000000000000000000000
## 20 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 21 0.0000000000000000040089758134594085207946800863965108874253928661346435546875000000000000000000000000000000000000000000000000000000
## 22 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 23 0.0000000000000000877229703080208169079651225885640997148584574460983276367187500000000000000000000000000000000000000000000000000000
## 24 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 25 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 26 0.0000000996704046435446652102418951812978775706142187118530273437500000000000000000000000000000000000000000000000000000000000000000
## 27 0.0000111503138261254224593028300249741846528195310384035110473632812500000000000000000000000000000000000000000000000000000000000000
## 28 0.0021567715433414235380593648727653999230824410915374755859375000000000000000000000000000000000000000000000000000000000000000000000
## 29 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 30 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 31 0.0130150785282694327754660079676796158310025930404663085937500000000000000000000000000000000000000000000000000000000000000000000000
## 32 0.9999999999733575339888602684368379414081573486328125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 33 0.0000000000000000002939943269257253841142407679853931767866015434265136718750000000000000000000000000000000000000000000000000000000
## 34 0.0000000230902453492269946742389274962903300547623075544834136962890625000000000000000000000000000000000000000000000000000000000000
## 35 0.0000000000011069818657987654447272599100848822217812994495034217834472656250000000000000000000000000000000000000000000000000000000
## 36 0.0000000000000000000000000000000000000146183446910236947277259011990224735200172290205955505371093750000000000000000000000000000000
## 37 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 38 0.0000000000000000027605226014118660095297830281424467102624475955963134765625000000000000000000000000000000000000000000000000000000
## 39 0.9999999845735978443173053165082819759845733642578125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 40 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 41 0.0000003764950487508283739467723760796502574521582573652267456054687500000000000000000000000000000000000000000000000000000000000000
## 42 0.0051374046439231994126695113322966790292412042617797851562500000000000000000000000000000000000000000000000000000000000000000000000
## 43 0.0000008272240157858174239985382669715363590512424707412719726562500000000000000000000000000000000000000000000000000000000000000000
## 44 0.0000000000002704382056290124125780618968839519311586627736687660217285156250000000000000000000000000000000000000000000000000000000
## 45 0.0000000092673017684619869646645007321694720303639769554138183593750000000000000000000000000000000000000000000000000000000000000000
## 46 0.0000000000000000000000000087245171358322165643811585411526721145492047071456909179687500000000000000000000000000000000000000000000
## 47 0.0000000000000000000000001768051300108728459336768223941760425077518448233604431152343750000000000000000000000000000000000000000000
## 48 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 49 0.0000000000000000008043797097043138683708507485903282940853387117385864257812500000000000000000000000000000000000000000000000000000
## 50 0.0000000000001832346311407060256322421043506665228051133453845977783203125000000000000000000000000000000000000000000000000000000000
## 51 0.0000000000021207670678241855551689892811495496971474494785070419311523437500000000000000000000000000000000000000000000000000000000
## 52 0.0000000000000000000000000000000000000000000000006757991436695814522518654943183946670615114271640777587890625000000000000000000000
## 53 0.0303452790768879709926153509513824246823787689208984375000000000000000000000000000000000000000000000000000000000000000000000000000
## 54 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 55 0.0000000004343661207081146368065571738981134330970235168933868408203125000000000000000000000000000000000000000000000000000000000000
## 56 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 57 0.0000000061411735570359950800445325569398846710100769996643066406250000000000000000000000000000000000000000000000000000000000000000
## 58 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008757152
## 59 0.9999994164038339405919941782485693693161010742187500000000000000000000000000000000000000000000000000000000000000000000000000000000
## 60 0.0000000000000000116167084200637080334227169142735647255904041230678558349609375000000000000000000000000000000000000000000000000000
## 61 0.0000000000000000448231454215971346181912604578201353433541953563690185546875000000000000000000000000000000000000000000000000000000
## 62 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 63 0.0000024502535460829765246761058339330929811694659292697906494140625000000000000000000000000000000000000000000000000000000000000000
## 64 0.0000000000011423855564940835118775686041203698550816625356674194335937500000000000000000000000000000000000000000000000000000000000
## 65 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 66 0.0000000001414101770339002895737062770464831373828928917646408081054687500000000000000000000000000000000000000000000000000000000000
## 67 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 68 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 69 0.9999999999998270272527634006110019981861114501953125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 70 0.0000000000000000116227384358952179083807054738031183660496026277542114257812500000000000000000000000000000000000000000000000000000
## 71 0.0263144216679913334200335839341278187930583953857421875000000000000000000000000000000000000000000000000000000000000000000000000000
## 72 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 73 0.0000000013929124464690875896850930093506804041680879890918731689453125000000000000000000000000000000000000000000000000000000000000
## 74 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 75 0.9810254462575809242252944386564195156097412109375000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 76 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 77 0.0000019942506748477742234762488671151459129760041832923889160156250000000000000000000000000000000000000000000000000000000000000000
## 78 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 79 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 80 0.0000000000351715654907029313469904452382763793139019981026649475097656250000000000000000000000000000000000000000000000000000000000
## 81 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 82 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 83 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 84 0.0274363140005409074373421418613361311145126819610595703125000000000000000000000000000000000000000000000000000000000000000000000000
## 85 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 86 0.0000000008457301625076878005479730227023082989035174250602722167968750000000000000000000000000000000000000000000000000000000000000
## 87 0.0000118797690332885366519891090497651475743623450398445129394531250000000000000000000000000000000000000000000000000000000000000000
## 88 0.0000018374743936530151301039781586155186232645064592361450195312500000000000000000000000000000000000000000000000000000000000000000
## 89 0.0000000000012144522442154906622996043963347290173260262235999107360839843750000000000000000000000000000000000000000000000000000000
## 90 0.0000000014009140770926996696736643865932592234457843005657196044921875000000000000000000000000000000000000000000000000000000000000
## 91 0.0000000029318132008440507733568192172413091611815616488456726074218750000000000000000000000000000000000000000000000000000000000000
## 92 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 93 0.0000000000815580245323561980191895415259750734549015760421752929687500000000000000000000000000000000000000000000000000000000000000
## 94 0.0000000000000000000000002360773661836484148702358654148270034056622534990310668945312500000000000000000000000000000000000000000000
## 95 0.0000000000000000930329034072286089815384535484099615132436156272888183593750000000000000000000000000000000000000000000000000000000
## 96 0.0000000000000005574846518635445458307586807933375894208438694477081298828125000000000000000000000000000000000000000000000000000000
## 97 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 98 0.0000000000000027444488122209030548769720381230285966012161225080490112304687500000000000000000000000000000000000000000000000000000
## 99 0.0000000003613146156164692752608622039467434206017060205340385437011718750000000000000000000000000000000000000000000000000000000000
## 100 0.0000000000012768770254743897038691990974967893635039217770099639892578125000000000000000000000000000000000000000000000000000000000
## 101 0.0000000000000000000261773200721502965083314906635791885491926223039627075195312500000000000000000000000000000000000000000000000000
## 102 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 103 0.0000000006083382439248878489033989236389743382460437715053558349609375000000000000000000000000000000000000000000000000000000000000
## 104 0.0000003353599720048645211299687129979929522960446774959564208984375000000000000000000000000000000000000000000000000000000000000000
## 105 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 106 0.0000016041792108189707474243113072276400998816825449466705322265625000000000000000000000000000000000000000000000000000000000000000
## 107 0.0000000000000000497518729627603853351430762685936315392609685659408569335937500000000000000000000000000000000000000000000000000000
## 108 0.0000000000000000018424143769188722083440801213782833656296133995056152343750000000000000000000000000000000000000000000000000000000
## 109 0.0000000000015525206102598691707949302154290194266650360077619552612304687500000000000000000000000000000000000000000000000000000000
## 110 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 111 0.0000046403604357248198759682000780202315581846050918102264404296875000000000000000000000000000000000000000000000000000000000000000
## 112 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 113 0.0000000000054447604066151702997475303469343543838476762175559997558593750000000000000000000000000000000000000000000000000000000000
## 114 0.0000000000000000854468874593949442950574502120275610650423914194107055664062500000000000000000000000000000000000000000000000000000
## 115 0.0000000000000000000000000328478098195932424578635799861103805596940219402313232421875000000000000000000000000000000000000000000000
## 116 0.9631478107009148192929615106550045311450958251953125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 117 0.9632965163312937617590137051593046635389328002929687500000000000000000000000000000000000000000000000000000000000000000000000000000
## 118 0.0000000520810023534324404315826173572645529930014163255691528320312500000000000000000000000000000000000000000000000000000000000000
## 119 0.9997615940675353973787764516600873321294784545898437500000000000000000000000000000000000000000000000000000000000000000000000000000
## 120 0.0000000047096392859003487728342118856517117819748818874359130859375000000000000000000000000000000000000000000000000000000000000000
## 121 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 122 0.0000000001868125427808899112567392331030191598983947187662124633789062500000000000000000000000000000000000000000000000000000000000
## 123 0.0000000000567889963290498762330665849518140930740628391504287719726562500000000000000000000000000000000000000000000000000000000000
## 124 0.0000000000000000000000000000000000000000958426943095521203554165712290568990283645689487457275390625000000000000000000000000000000
## 125 0.0000014772734178086702426773574314822212727449368685483932495117187500000000000000000000000000000000000000000000000000000000000000
## 126 0.0000000000002627735229406056433098040625573332818021299317479133605957031250000000000000000000000000000000000000000000000000000000
## 127 0.0000000101304546106378335571657212099694334028754383325576782226562500000000000000000000000000000000000000000000000000000000000000
## 128 0.1220332778200839546345335406840604264289140701293945312500000000000000000000000000000000000000000000000000000000000000000000000000
## 129 0.0000000000803487315692802863548171998786529002245515584945678710937500000000000000000000000000000000000000000000000000000000000000
## 130 0.0453058045180589480382948863734782207757234573364257812500000000000000000000000000000000000000000000000000000000000000000000000000
## 131 0.9999999753534921653752576276019681245088577270507812500000000000000000000000000000000000000000000000000000000000000000000000000000
## 132 0.0000000031907750854301417172288052315209938569751102477312088012695312500000000000000000000000000000000000000000000000000000000000
## 133 0.9098006723928865335793148005905095487833023071289062500000000000000000000000000000000000000000000000000000000000000000000000000000
## 134 0.0000000000000000000000000000000001151321157572149220553112125564609868888510391116142272949218750000000000000000000000000000000000
## 135 0.0000004996853867304061334961989837566420646908227354288101196289062500000000000000000000000000000000000000000000000000000000000000
## 136 0.0051162367150289550163377860769742255797609686851501464843750000000000000000000000000000000000000000000000000000000000000000000000
## 137 0.9999999999901685310277343887719325721263885498046875000000000000000000000000000000000000000000000000000000000000000000000000000000
## 138 0.0000000000394102933786781959336897873669158798293210566043853759765625000000000000000000000000000000000000000000000000000000000000
## 139 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 140 0.9994345011748987372968144882179331034421920776367187500000000000000000000000000000000000000000000000000000000000000000000000000000
## 141 0.9999999999999593658372987192706204950809478759765625000000000000000000000000000000000000000000000000000000000000000000000000000000
## 142 0.0007332173650095377692714748718572081997990608215332031250000000000000000000000000000000000000000000000000000000000000000000000000
## 143 0.9999989810572786907982845150399953126907348632812500000000000000000000000000000000000000000000000000000000000000000000000000000000
## 144 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 145 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 146 0.0000000000470037617179969324527483154341211957216728478670120239257812500000000000000000000000000000000000000000000000000000000000
## 147 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 148 0.0000002317276715220916731043099279219177333288826048374176025390625000000000000000000000000000000000000000000000000000000000000000
## 149 0.0000000849626241133124053550584875438289600424468517303466796875000000000000000000000000000000000000000000000000000000000000000000
## 150 0.0000000000000000000000000000000000000000000000000000000000000000001074880156031621173252747769577553071940201334655284881591796875
## 151 0.0008170369274782562397929641662130961776711046695709228515625000000000000000000000000000000000000000000000000000000000000000000000
## 152 0.0000000000000000000000265610828941182368833837101895767318637808784842491149902343750000000000000000000000000000000000000000000000
## 153 0.0000000000000000215275713379214336286315845114813782856799662113189697265625000000000000000000000000000000000000000000000000000000
## 154 0.0000000000000448203982122825164398707831203694240684853866696357727050781250000000000000000000000000000000000000000000000000000000
## 155 0.0000000004528835437026721726615219321132599361590109765529632568359375000000000000000000000000000000000000000000000000000000000000
## 156 0.0000142084500430176112343097494239430034213000908493995666503906250000000000000000000000000000000000000000000000000000000000000000
## 157 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 158 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 159 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Sensitivity Specificity Accuracy
## Best DA 0.9393939 0.9354839 0.9371069
## Best_Neural_Network_Predictions_Dummy
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 0
## 8 1
## 9 1
## 10 1
## 11 1
## 12 1
## 13 1
## 14 1
## 15 1
## 16 1
## 17 1
## 18 0
## 19 0
## 20 1
## 21 0
## 22 1
## 23 0
## 24 1
## 25 1
## 26 0
## 27 1
## 28 0
## 29 1
## 30 1
## 31 1
## 32 1
## 33 0
## 34 0
## 35 0
## 36 0
## 37 1
## 38 0
## 39 1
## 40 1
## 41 0
## 42 0
## 43 0
## 44 0
## 45 0
## 46 0
## 47 0
## 48 1
## 49 0
## 50 0
## 51 0
## 52 0
## 53 0
## 54 1
## 55 0
## 56 1
## 57 0
## 58 0
## 59 1
## 60 0
## 61 0
## 62 1
## 63 0
## 64 0
## 65 1
## 66 0
## 67 1
## 68 1
## 69 1
## 70 0
## 71 0
## 72 1
## 73 0
## 74 1
## 75 0
## 76 1
## 77 0
## 78 1
## 79 1
## 80 0
## 81 1
## 82 1
## 83 1
## 84 0
## 85 1
## 86 0
## 87 0
## 88 0
## 89 0
## 90 0
## 91 0
## 92 1
## 93 0
## 94 0
## 95 0
## 96 0
## 97 1
## 98 0
## 99 0
## 100 0
## 101 0
## 102 1
## 103 0
## 104 0
## 105 1
## 106 0
## 107 0
## 108 0
## 109 0
## 110 1
## 111 0
## 112 1
## 113 0
## 114 0
## 115 0
## 116 0
## 117 1
## 118 0
## 119 0
## 120 0
## 121 1
## 122 0
## 123 0
## 124 0
## 125 1
## 126 0
## 127 0
## 128 0
## 129 0
## 130 0
## 131 1
## 132 0
## 133 0
## 134 0
## 135 0
## 136 0
## 137 1
## 138 0
## 139 1
## 140 0
## 141 1
## 142 0
## 143 1
## 144 1
## 145 1
## 146 0
## 147 1
## 148 0
## 149 0
## 150 0
## 151 0
## 152 0
## 153 0
## 154 0
## 155 0
## 156 0
## 157 1
## 158 1
## 159 1
## Best_Neural_Network_Predictions_Probabilities
## 1 0.99999369785
## 2 0.99995178208
## 3 0.99967082496
## 4 0.99989008907
## 5 0.99857862853
## 6 0.99999912764
## 7 0.00078424028
## 8 0.99058084821
## 9 0.99994862982
## 10 0.99953126797
## 11 0.99878830428
## 12 0.72952214013
## 13 0.99390807526
## 14 0.99781170241
## 15 0.99996796688
## 16 0.99995164895
## 17 0.99999599600
## 18 0.00558704745
## 19 0.00864241303
## 20 0.99998063324
## 21 0.00096614613
## 22 0.99996961811
## 23 0.00064219188
## 24 0.99937992989
## 25 0.73194142906
## 26 0.00007041882
## 27 0.88763184702
## 28 0.06256823669
## 29 0.99999330653
## 30 0.99997850011
## 31 0.94204935601
## 32 0.99999425033
## 33 0.00016598179
## 34 0.00148147353
## 35 0.00032244816
## 36 0.00043893643
## 37 0.99877808534
## 38 0.00047681857
## 39 0.99996851426
## 40 0.99998012687
## 41 0.00010367647
## 42 0.01692568339
## 43 0.00159618549
## 44 0.21706307688
## 45 0.00036450055
## 46 0.00040508702
## 47 0.00007923638
## 48 0.99999750162
## 49 0.00062170058
## 50 0.00018794521
## 51 0.00067805904
## 52 0.00019123316
## 53 0.00391673363
## 54 0.99999693290
## 55 0.00143229503
## 56 0.99999745969
## 57 0.00011794213
## 58 0.00012251458
## 59 0.99959655470
## 60 0.00155967132
## 61 0.00078002120
## 62 0.99989893827
## 63 0.00208466800
## 64 0.00051810352
## 65 0.93555293905
## 66 0.00034061069
## 67 0.99986030865
## 68 0.99998968257
## 69 0.63704553416
## 70 0.00039362585
## 71 0.00430321597
## 72 0.99999699122
## 73 0.00008087768
## 74 0.99996008706
## 75 0.10974143410
## 76 0.99999034491
## 77 0.00114921225
## 78 0.99999698967
## 79 0.99998295925
## 80 0.00051092692
## 81 0.99999514751
## 82 0.99994028448
## 83 0.99999238899
## 84 0.01615532669
## 85 0.99998819840
## 86 0.00206753151
## 87 0.00005984042
## 88 0.00047214295
## 89 0.00183945840
## 90 0.00054016441
## 91 0.00046749024
## 92 0.99999704596
## 93 0.00008849256
## 94 0.00401458187
## 95 0.00045657924
## 96 0.00056996081
## 97 0.99252959781
## 98 0.00044044525
## 99 0.00021118419
## 100 0.00060570614
## 101 0.00032680896
## 102 0.99999510902
## 103 0.00046435962
## 104 0.00033933671
## 105 0.99999705865
## 106 0.00052605403
## 107 0.00008015152
## 108 0.00086388358
## 109 0.00036713027
## 110 0.99999628406
## 111 0.10525678305
## 112 0.99988635872
## 113 0.00526028556
## 114 0.00028463503
## 115 0.00010447583
## 116 0.41176600162
## 117 0.98899476247
## 118 0.00008796902
## 119 0.07463847274
## 120 0.00126443964
## 121 0.99999616707
## 122 0.00015896985
## 123 0.00065866409
## 124 0.00001877255
## 125 0.50775408491
## 126 0.02588968679
## 127 0.00172216804
## 128 0.01476399870
## 129 0.10461971603
## 130 0.01896159103
## 131 0.75518213831
## 132 0.00130404413
## 133 0.40005131372
## 134 0.00012104954
## 135 0.00056154789
## 136 0.26018054706
## 137 0.99930499071
## 138 0.00032076466
## 139 0.99999599392
## 140 0.00503075742
## 141 0.99962126343
## 142 0.00868131201
## 143 0.94541060757
## 144 0.99999549248
## 145 0.99998950853
## 146 0.00260190515
## 147 0.99999683429
## 148 0.00047157738
## 149 0.20363102154
## 150 0.00031408588
## 151 0.12025797038
## 152 0.00016951203
## 153 0.00067645005
## 154 0.00020484482
## 155 0.00212811550
## 156 0.03153583689
## 157 0.99999699254
## 158 0.99999291105
## 159 0.99999699271
## Sensitivity Specificity Accuracy
## Best Neural Network 0.969697 0.9784946 0.9748428
# Ensemble 1: majority vote across the five best models.
# Column 1 of each DF_Best_*_Predictions data frame holds the 0/1 class prediction.
Majority_DF <- data.frame(DF_Best_Logistic_Predictions[,1], DF_Best_KNN_Predictions[,1], DF_best_boosted_pred[,1], DF_Best_DA_Predictions[,1], DF_Best_Neural_Network_Predictions[,1])
# A row is classed 1 when more than half of the 5 models vote 1.
# Vectorized with rowMeans: replaces the original per-row loop, whose
# sum(a + b + ...) wrapped an already-summed scalar.
Majority_DF[,6] <- ifelse(rowMeans(Majority_DF[, 1:5]) > 0.5, 1, 0)
colnames(Majority_DF) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Majority Vote")
# Ground truth from the validation set (factor -> numeric 0/1)
Majority_DF$'Validation Actual' <- as.numeric(as.character(Validation$diagnosis))
# Caption typo fixed ("Valildation" -> "Validation")
DT::datatable(Majority_DF, caption = "Best 5 Models on Validation - Majority of Votes")
# Ensemble 2: average of the five best models' predicted probabilities.
# Column 2 of each DF_Best_*_Predictions data frame holds the probability of class 1.
Average_DF <- data.frame(DF_Best_Logistic_Predictions[,2], DF_Best_KNN_Predictions[,2], DF_best_boosted_pred[,2], DF_Best_DA_Predictions[,2], DF_Best_Neural_Network_Predictions[,2])
# Vectorized mean probability per observation: replaces the original per-row
# loop, whose sum(a + b + ...) wrapped an already-summed scalar.
Average_DF[,6] <- rowMeans(Average_DF[, 1:5])
colnames(Average_DF) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Average")
# Classify at the 0.5 probability cutoff
Average_DF$'Average Cutoff 0.5' <- ifelse(Average_DF$`Average`>0.5,1,0)
# Ground truth from the validation set (factor -> numeric 0/1)
Average_DF$'Validation Actual' <- as.numeric(as.character(Validation$diagnosis))
DT::datatable(round(Average_DF,4), caption = "Best 5 Models on Validation - Average of Probabilities")
# Confusion matrices for both ensembles; "1" (Malignant) is the positive class.
# Majority-vote ensemble: column 6 = predicted class, column 7 = actual class.
Majority_factor <- data.frame(as.factor(Majority_DF[, 6]), as.factor(Majority_DF[, 7]))
Majority_confusion <- confusionMatrix(data = Majority_factor[, 1], reference = Majority_factor[, 2], positive = "1")
draw_confusion_matrix(Majority_confusion)
# Probability-average ensemble: column 7 = cutoff class, column 8 = actual class.
Average_factor <- data.frame(as.factor(Average_DF[, 7]), as.factor(Average_DF[, 8]))
Average_confusion <- confusionMatrix(data = Average_factor[, 1], reference = Average_factor[, 2], positive = "1")
draw_confusion_matrix(Average_confusion)
Comments: the average-of-probabilities ensemble performs better than the majority vote in both Accuracy and Sensitivity.
set.seed(1)
# Working copy of the original data for clustering, as a plain data.frame
KClusteringDF <- data.frame(ORIGINAL)
# Drop the first column ("id"), which carries no clinical information
KClusteringDF <- KClusteringDF[, -1]
# Center and scale every feature so no variable dominates the distance metric
Norm_Kmeans <- preProcess(KClusteringDF, method = c("center", "scale"))
KClusteringDF_Preprocess <- predict(Norm_Kmeans, KClusteringDF)
# Split the scaled data by diagnosis (0 = Benign, 1 = Malign)
is_benign <- KClusteringDF_Preprocess$diagnosis == 0
is_malign <- KClusteringDF_Preprocess$diagnosis == 1
Benign_ClusterDF <- KClusteringDF_Preprocess[is_benign, ]
Malign_ClusterDF <- KClusteringDF_Preprocess[is_malign, ]
# Unscaled malignant rows, kept aside for interpretable cluster profiling later
Malign_ClusterDF_No_Scale <- KClusteringDF[KClusteringDF$diagnosis == 1, ]
Let’s check whether, when considering the whole dataset, we can find meaningful clusters.
set.seed(1)
# Load Library (k-means diagnostics and cluster visualization helpers)
library(factoextra)
# Encode diagnosis as a labeled factor and build "<label>_<row>" row names so
# the cluster plot can show which tumor type landed in which cluster
KClusteringDF_Preprocess$diagnosis <- factor(KClusteringDF_Preprocess$diagnosis, levels = c(0,1), labels=c("Benign","Malign"))
rownames(KClusteringDF_Preprocess) <- paste(KClusteringDF_Preprocess$diagnosis, 1:dim(KClusteringDF_Preprocess)[1], sep = "_")
# Choose the number of clusters with three criteria: elbow (within-cluster SS),
# average silhouette width, and the gap statistic
fviz_nbclust(KClusteringDF_Preprocess[,-1], kmeans, method = "wss")
fviz_nbclust(KClusteringDF_Preprocess[,-1], kmeans, method = "silhouette")
fviz_nbclust(KClusteringDF_Preprocess[,-1], kmeans, method = "gap_stat")
# Final fit with k = 2; nstart = 100 random restarts guards against poor local
# optima. NOTE: the fviz_nbclust calls above consume RNG draws, so reordering
# or removing them would change the fitted clusters despite set.seed(1).
Cluster_ALL <- kmeans(KClusteringDF_Preprocess[,-1], centers = 2, iter.max = 100, nstart = 100)
Comments: We can find the optimal number of clusters with three different methods: WSS (Within-Cluster Sum of Squared Errors), also called the Elbow Method; the Silhouette Method, which accounts for the separation between clusters; and the Gap Statistic. Here all three methods give us an optimal number of 2 clusters, which makes sense given that a tumor is either Benign or Malign. Let’s see it graphically and check how it matches the real world.
set.seed(1)
# Plot the 2-cluster solution on the full scaled dataset. labelsize = 0 hides
# fviz_cluster's own labels; geom_text then adds the "Benign_i"/"Malign_i" row
# names so cluster membership can be compared with the true diagnosis.
fviz_cluster(Cluster_ALL, data = KClusteringDF_Preprocess[,-1], main="Cluster Model 1", labelsize = 0)+ geom_text(
label=rownames(KClusteringDF_Preprocess),
nudge_x = 0.25, nudge_y = 0.25,
check_overlap = T, size=2)
Comments: We can see that K-Means, without any human intervention, found 2 clusters to be optimal on the whole dataset, and separated benign and malign tumors accordingly. We can check how it performed. Here Cluster Number 1 would be the Malign Tumors and Cluster Number 2 the Benign Tumors.
set.seed(1)
# Compare the k-means assignments against the true diagnosis labels
DF_Cluster_Performance <- as.data.frame(Cluster_ALL$cluster)
DF_Cluster_Performance <- cbind(DF_Cluster_Performance, KClusteringDF_Preprocess$diagnosis)
# seq_len(nrow(...)) fixes the original `c(1:dim(DF_Cluster_Performance))`,
# which fed the length-2 dim vector to `:` and emitted the
# "numerical expression has 2 elements: only the first is used" warning
rownames(DF_Cluster_Performance) <- seq_len(nrow(DF_Cluster_Performance))
# Recode: cluster 1 -> Malign (1), cluster 2 -> Benign (0); diagnosis label -> 0/1
DF_Cluster_Performance$`Cluster_ALL$cluster` <- ifelse(DF_Cluster_Performance$`Cluster_ALL$cluster` == 1, 1,0)
DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` <- ifelse(DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` == "Malign",1,0)
# Convert both binary columns to factors, as caret::confusionMatrix requires
DF_Cluster_Performance$`Cluster_ALL$cluster` <- factor(DF_Cluster_Performance$`Cluster_ALL$cluster`)
DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` <- factor(DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis`)
# Confusion matrix with Malign (1) as the positive class
Confusion_Matrix_K_Means1 <- confusionMatrix(data = DF_Cluster_Performance$`Cluster_ALL$cluster`, reference = DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis`,positive = "1")
# Create the Function for Confusion Matrix.
# Thin wrapper around draw_confusion_matrix() (defined at the top of this file),
# whose `titleaddon` parameter already produces the exact title
# "CONFUSION MATRIX for K-Means - Model 1"; this removes ~40 lines of
# duplicated base-graphics plotting code.
#
# cm: a caret::confusionMatrix object (Benign/Malignant, positive = "1").
draw_confusion_matrix_K_Means1 <- function(cm) {
  draw_confusion_matrix(cm, 'for K-Means - Model 1')
}
# Plot the Confusion Matrix
draw_confusion_matrix_K_Means1(Confusion_Matrix_K_Means1)
Comments: We can appreciate that the K-Means algorithm found 2 different types of tumors in our dataset (Malign and Benign), showing that we do have good separability from our features. The Accuracy is not that good, but clustering is not aimed at being good at prediction; rather, it reveals insights about the dataset when combined with some knowledge of the field.
Since cancerous tumors in the breast are not all equal — some being at different stages or of different types — we could apply the K-Means clustering model to find out whether there is some separation among them, which could suggest an a priori number of clusters for further analysis in medical research.
Types of Breast Cancer - American Society
For examples, we can find this article about how stages are rated:
“In both staging systems, 7 key pieces of information are used:
We can see that we lack a lot of information only using this dataset, we could only infer the size of the cancer based on 1 tumor, without nearby information. Thus we will be limited in the clustering method to only size as an information for the stage of the tumor.
set.seed(1)
# Load Library (k-means diagnostics and cluster visualization helpers)
library(factoextra)
# Drop unused factor levels: this subset contains only malignant tumors
Malign_ClusterDF$diagnosis <- factor(Malign_ClusterDF$diagnosis)
# Choose the number of clusters among malignant tumors only
# (elbow / silhouette / gap statistic)
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "wss")
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "silhouette")
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "gap_stat")
# Fit both candidate solutions (k = 3 and k = 2) for comparison.
# NOTE: the fviz_nbclust calls above consume RNG draws, so reordering this
# chunk would change the fitted clusters despite set.seed(1).
Cluster_Malign_1 <- kmeans(Malign_ClusterDF[,-1], centers = 3, iter.max = 100, nstart = 100)
Cluster_Malign_2 <- kmeans(Malign_ClusterDF[,-1], centers = 2, iter.max = 100, nstart = 100)
Comments: The 3 methods don’t converge to the same number of clusters, but we can see that the Elbow Method and the Silhouette Method would say either 2 or 3 groups is optimal. The Gap Statistic shows no cluster separation among the Malign tumors. We can try separating into 2 and 3 clusters and look at the profiling of those groups. (For simplicity, we only compare mean variables as meaningful measures to interpret our tumors.)
set.seed(1)
# Plot Cluster Model 2 (malignant tumors only) with 3 clusters
# (original comment said "Model 1"; the plot title shows this is Model 2)
fviz_cluster(Cluster_Malign_1, data = Malign_ClusterDF[,-1], main="Cluster Model 2 - Only Malign Tumors", subtitle="with 3 Clusters", labelsize = 0)
Comments: Some overlap occurs in this 2D graphs, but considering all dimensions, there is no overlap at all. We can see that we could with human interpretation, see that there would be indeed 3 different clusters in the Malign Tumors. Let’s check the centroid.
# Centroids of the 3-cluster solution (coordinates are in scaled feature space)
Clusters_Malign_1_Centers <- Cluster_Malign_1$centers
DT::datatable(round(Clusters_Malign_1_Centers,5), caption = "Centroid from Model 2 - 3 Clusters")
Comments: We can see that radius_mean (perimeter_mean and area_mean are quite similar) is indeed one variable clearly separating the Malign Tumors into 3 clusters, as are concavity_mean and compactness_mean. smoothness_mean also separates the clusters from each other. (For simplicity, we only compare mean variables as meaningful measures to interpret our tumors.)
# Assign each unscaled malignant row to its cluster from the 3-cluster model,
# so the profiling below uses interpretable (millimeter-scale) values
split1 <- split(Malign_ClusterDF_No_Scale, Cluster_Malign_1$cluster)
cluster_1 <- split1[["1"]]
cluster_2 <- split1[["2"]]
cluster_3 <- split1[["3"]]
# Browse the members of each cluster (the first column, diagnosis, is dropped)
DT::datatable(cluster_1[, -1], caption = "Cluster 1")
DT::datatable(cluster_2[, -1], caption = "Cluster 2 ")
DT::datatable(cluster_3[, -1], caption = "Cluster 3")
# Load Libraries
library(ggpubr)
library(ggplot2)
# Helper: one boxplot of radius_mean for a given cluster. The shared
# ylim(10, 30) keeps all panels on the same scale so medians are directly
# comparable. (Replaces three copy-pasted ggplot blocks; subtitle typo
# "millimiters" fixed.)
radius_boxplot <- function(cluster_df, plot_title) {
  ggplot(cluster_df) +
    aes(x = "", y = radius_mean) +
    geom_boxplot(fill = "#1c6155") +
    labs(title = plot_title,
         subtitle = "in millimeters") +
    theme_minimal() + ylim(10, 30)
}
boxcluster1 <- radius_boxplot(cluster_1, "Cluster 1")
boxcluster2 <- radius_boxplot(cluster_2, "Cluster 2")
boxcluster3 <- radius_boxplot(cluster_3, "Cluster 3")
# Arrange the three panels side by side under a common title
ggarrange1 <- ggarrange(boxcluster1,boxcluster2,boxcluster3, ncol = 3)
annotate_figure(ggarrange1,
top = text_grob("Boxplot for radius_mean Among Clusters", color = "black", face = "bold", size = 14))
Comments: We can see some tumors being greater than 20mm, others between 15mm and 20mm, and lastly some under 15mm. Such a result in the cluster medians is very interesting, and knowing how the staging system works can actually lead us to prefer the clustering into 2 groups: one cluster being below 20mm and the other greater than or equal to 20mm. We will do so in the following part.
set.seed(1)
# Plot Cluster Model 2 (malignant tumors only) with 2 clusters
# (original comment said "Model 1"; the plot title shows this is Model 2)
fviz_cluster(Cluster_Malign_2, data = Malign_ClusterDF[,-1], main="Cluster Model 2 - Only Malign Tumors", subtitle="with 2 Clusters", labelsize = 0)
Comments: Overlapping still occurs in such 2d graphs, but we can see also the trend of 2 groups, the Red one being more spread than the blue one, and some spread also happen in the bottom center of the plot for the blue cluster.
# Centroids of the 2-cluster solution (coordinates are in scaled feature space)
Clusters_Malign_2_Centers <- Cluster_Malign_2$centers
DT::datatable(round(Clusters_Malign_2_Centers,5), caption = "Centroid from Model 2 - 2 Clusters")
Comments: We can also see the radius_mean (perimeter_mean and area_mean) being very important in the separation, compactness_mean and concavity_mean as well. symmetry_mean is also quite different and smoothness_mean as well.
# Assign each unscaled malignant row to its cluster from the 2-cluster model,
# so the profiling below uses interpretable (millimeter-scale) values
split2 <- split(Malign_ClusterDF_No_Scale, Cluster_Malign_2$cluster)
cluster2_1 <- split2[["1"]]
cluster2_2 <- split2[["2"]]
# Browse the members of each cluster (the first column, diagnosis, is dropped)
DT::datatable(cluster2_1[, -1], caption = "Cluster 1")
DT::datatable(cluster2_2[, -1], caption = "Cluster 2")
# Load Libraries
library(ggpubr)
library(ggplot2)
# BUG FIX: this section profiles the 2-cluster model, but the original code
# re-plotted cluster_1/cluster_2 from the 3-cluster split. Use cluster2_1 /
# cluster2_2 so the boxplots match the T2/T1 discussion and the proportions
# computed below. (Subtitle typo "millimiters" also fixed.)
boxcluster1 <- ggplot(cluster2_1) +
aes(x = "", y = radius_mean) +
geom_boxplot(fill = "#1c6155") +
labs(title = "Cluster 1 (T2)",
subtitle = "in millimeters") +
theme_minimal() + ylim(10, 30)
boxcluster2 <- ggplot(cluster2_2) +
aes(x = "", y = radius_mean) +
geom_boxplot(fill = "#1c6155") +
labs(title = "Cluster 2 (T1)",
subtitle = "in millimeters") +
theme_minimal() + ylim(10, 30)
# Side-by-side panels with a shared y-scale for direct comparison
ggarrange2 <- ggarrange(boxcluster1,boxcluster2)
annotate_figure(ggarrange2,
top = text_grob("Boxplot for radius_mean Among Clusters", color = "black", face = "bold", size = 14))
Comments: without having the full key pieces information for the staging systems from the American Cancer Society, we can already have some metrics for the T key which is the size of the tumor, but without the nearby areas. The dataset suggest that the measure are for primary tumors only. If we look at the Cluster 1, the radius_mean median seems to be higher than 2cm or 20mm but less than 5cm or 50mm. Thus we would attribute the T2 key to this Cluster. Cluster 2 in opposite is having an median close to 1.4cm or 14mm, since this is less than 2cm ro 20mm, we could attribute the key T1 to this cluster. Nevertheless, we should remember that some member of Cluster 1 are less than 20mm, and thus we shouldn’t categorize them as T2 following the guidelines. For simplicity, we will keep those tumors in the Cluster 1 but if we wanted to decide or not wether a member is subject to T2, we should use other metrics to check the size exactitude before removing it to the T2 label.
Cluster 2 with T1 could potentially lead us to such Stages:
Stage IA: The tumor is small, invasive, and has not spread to the lymph nodes (T1, N0, M0). Stage IB: Cancer has spread to the lymph nodes and the cancer in the lymph node is larger than 0.2 mm but less than 2 mm in size. There is either no evidence of a tumor in the breast or the tumor in the breast is 20 mm or smaller (T0 or T1, N1mi, M0). Stage IIIC: A tumor of any size that has spread to 10 or more axillary lymph nodes, the internal mammary lymph nodes, and/or the lymph nodes under the collarbone. It has not spread to other parts of the body (any T, N3, M0). Stage IV (metastatic): The tumor can be any size and has spread to other organs, such as the bones, lungs, brain, liver, distant lymph nodes, or chest wall (any T, any N, M1). Metastatic cancer found when the cancer is first diagnosed occurs about 6% of the time. This may be called de novo metastatic breast cancer. Most commonly, metastatic breast cancer is found after a previous diagnosis of early stage breast cancer.
Cluster 1 with T2 could potentially lead us to such Stages:
Stage IIA: Any 1 of these conditions: The tumor is larger than 20 mm but not larger than 50 mm and has not spread to the axillary lymph nodes (T2, N0, M0). Stage IIB: The tumor is larger than 20 mm but not larger than 50 mm and has spread to 1 to 3 axillary lymph nodes (T2, N1, M0). Stage IIIA: The tumor of any size has spread to 4 to 9 axillary lymph nodes or to internal mammary lymph nodes. It has not spread to other parts of the body (T0, T1, T2, or T3; N2; M0). Stage IIIC: A tumor of any size that has spread to 10 or more axillary lymph nodes, the internal mammary lymph nodes, and/or the lymph nodes under the collarbone. It has not spread to other parts of the body (any T, N3, M0). Stage IV (metastatic): The tumor can be any size and has spread to other organs, such as the bones, lungs, brain, liver, distant lymph nodes, or chest wall (any T, any N, M1). Metastatic cancer found when the cancer is first diagnosed occurs about 6% of the time. This may be called de novo metastatic breast cancer. Most commonly, metastatic breast cancer is found after a previous diagnosis of early stage breast cancer.
Comments: We can see that we lack a lot of information to actually get to the actual stage of the cancerous breast tumors, depending on the source, we may lack 2 more information if we follow Cancer.Net staging system: Node (N - Has the tumor spread to the lymph nodes? If so, where, what size, and how many?) or Metastasis (M - Has the cancer spread to other parts of the body?). The American Cancer Society requires way more information, up to 7 in total plus additional recurrence test. Here is the 7 keys parameters:
# Computing Proportions of T1 and T2 (T1 = cluster2_2, T2 = cluster2_1)
Proportions_T1 <- nrow(cluster2_2)/nrow(Malign_ClusterDF_No_Scale)
Proportions_T2 <- nrow(cluster2_1)/nrow(Malign_ClusterDF_No_Scale)
# Rounding Proportions for display
Proportions_T1 <- round(Proportions_T1,3)
Proportions_T2 <- round(Proportions_T2,3)
# Total number of malignant observations, reused in the chart title
Malign_Total <- nrow(Malign_ClusterDF_No_Scale)
# Pie chart data frame. Counts are computed from the clusters rather than
# hard-coded (the original c(151, 61) would silently go stale if the
# clustering changed, and its total of 212 contradicted the title's "221").
Pie_T1_T2 <- data.frame(
t = c("T1", "T2"),
n = c(nrow(cluster2_2), nrow(cluster2_1)),
prop = c(Proportions_T1, Proportions_T2))
# Pie ggplot: stacked bar in polar coordinates, labeled with percentages;
# the title reports the actual malignant count instead of a hard-coded number
ggplot(Pie_T1_T2, aes(x="", y=n, fill=t)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) + theme_void() + geom_text(aes(label = paste0(100*prop, "%")), position = position_stack(vjust=0.5), color="white", size=6) +
labs(x = NULL, y = NULL, fill = "T Category") + scale_fill_manual(values=c("#1c6155","#66807b")) + ggtitle(paste0("Pie Chart of T Category Proportions for Malign Tumors (", Malign_Total, " obs.)"))
Comments:
Logistic Regression in Machine Learning
Convergence Error in Logistic Regression
Penalized Logistic Regression Essentials in R: Ridge, Lasso and Elastic Net
Lasso Regression in R (Step-by-Step)
How to create a ROC curve in R
How to choose the number of hidden layers and nodes in a feedforward neural network?
Introduction to Neural Networks for Java (second edition) by Jeff Heaton - Google Books
Do we need to set training set and testing set for clustering?
K-means Clustering: Algorithm, Applications, Evaluation Methods, and Drawbacks
Types of Breast Cancer - American Society
Breast Cancer Stages - cancer.org
Breast Cancer: Stages - Cancer.Net